diff --git a/.gitignore b/.gitignore
index c512620e..8a394a9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,4 +164,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
-/docs/apidocs/
+
+pipelines/deprecated_pipelines
+
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
index 91d0b1ba..53c1b295 100644
--- a/deeprvat/annotations/annotations.py
+++ b/deeprvat/annotations/annotations.py
@@ -6,7 +6,7 @@
import time
from pathlib import Path
from typing import Optional
-
+import dask.dataframe as dd
import numpy as np
import click
import keras.backend as K
@@ -16,10 +16,33 @@
from joblib import Parallel, delayed
from keras.models import load_model
from sklearn.decomposition import PCA
-from tqdm import tqdm
+from tqdm import tqdm, trange
+from fastparquet import ParquetFile
+import yaml
def precision(y_true, y_pred):
+ """
+ Calculate precision, a metric for the accuracy of the positive predictions.
+
+    Precision is defined as the fraction of relevant instances among the retrieved instances.
+
+ Parameters:
+ - y_true (Tensor): The true labels (ground truth).
+ - y_pred (Tensor): The predicted labels.
+
+ Returns:
+ float: Precision value.
+
+ Notes:
+ - This function uses the Keras backend functions to perform calculations.
+ - Precision is calculated as `true_positives / (predicted_positives + epsilon)`, where epsilon is a small constant to avoid division by zero.
+
+
+ References:
+ - https://en.wikipedia.org/wiki/Precision_and_recall
+
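+    Example:
+        For y_true = [1, 0, 1] and y_pred = [1, 1, 1], true_positives = 2 and
+        predicted_positives = 3, so precision is approximately 0.67.
+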
+ """
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
@@ -28,6 +51,26 @@ def precision(y_true, y_pred):
def recall(y_true, y_pred):
+ """
+ Calculate recall, a metric for the ability to capture true positive instances.
+
+ Recall is defined as the fraction of relevant instances that were retrieved.
+
+ Parameters:
+ - y_true (Tensor): The true labels (ground truth).
+ - y_pred (Tensor): The predicted labels.
+
+ Returns:
+ - float: Recall value.
+
+ Notes:
+ - This function uses the Keras backend functions to perform calculations.
+ - Recall is calculated as `true_positives / (possible_positives + epsilon)`, where epsilon is a small constant to avoid division by zero.
+
+
+ References:
+ - https://en.wikipedia.org/wiki/Precision_and_recall
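+
+    Example:
+        For y_true = [1, 0, 1] and y_pred = [1, 1, 1], true_positives = 2 and
+        possible_positives = 2, so recall is approximately 1.0.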
+ """
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
@@ -36,6 +79,28 @@ def recall(y_true, y_pred):
def deepripe_get_model_info(saved_models_dict, saved_deepripe_models_path):
+ """
+ Retrieve information about the paths and names of saved deepRiPe models.
+
+ Parameters:
+ - saved_models_dict (dict): A dictionary containing keys for different types of models. Keys include "parclip" for PAR-CLIP models, "eclip_hg2" for eCLIP models in HepG2, and "eclip_k5" for eCLIP models in K562. Values are model identifiers.
+ - saved_deepripe_models_path (str): The path to the directory where the deepRiPe models are saved.
+
+ Returns:
+ tuple: A tuple containing two dictionaries.
+ The first dictionary contains paths for each type of model, with keys
+ "parclip", "eclip_hg2", and "eclip_k5" and values as lists of paths corresponding to high,
+ medium, and low sequence models.
+ The second dictionary contains lists of RBP names for each type of model, with keys
+ "parclip", "eclip_hg2", and "eclip_k5" and values as lists of RBP names for high, medium, and
+ low sequence models.
+
+ Notes:
+ - The function constructs file paths based on the provided model identifiers.
+ - The resulting dictionary structure allows easy access to model paths for different types.
+
+
+ """
shared_path = Path(saved_deepripe_models_path)
# parclip
@@ -106,7 +171,7 @@ def deepripe_get_model_info(saved_models_dict, saved_deepripe_models_path):
"G45",
"XPO5",
]
- )
+ ) # 27
pc_RBPnames_med = np.array(
[
@@ -358,7 +423,27 @@ def deepripe_get_model_info(saved_models_dict, saved_deepripe_models_path):
def seq_to_1hot(seq, randomsel=True):
- "converts the sequence to one-hot encoding"
+ """
+ Convert a nucleotide sequence to one-hot encoding.
+
+ Parameters:
+ - seq (str): The input nucleotide sequence.
+    - randomsel (bool): If True, encode each ambiguous base as a randomly selected base.
+      If False, encode ambiguous positions as all-zero rows.
+
+    Returns:
+    numpy.ndarray: A 2D array representing the one-hot encoding of the input sequence.
+        Rows correspond to positions in the input sequence; the four columns correspond
+        to the nucleotides 'A', 'C', 'G', 'T' in that order.
+
+ Notes:
+ - Ambiguous bases are handled based on the 'randomsel' parameter.
+
+
+
+ References:
+ - one-hot encoding: https://en.wikipedia.org/wiki/One-hot
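+
+    Example:
+        seq_to_1hot("ACGT") yields a 4x4 identity matrix: each position has a single 1
+        in the channel of its base.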
+ """
seq_len = len(seq)
seq = seq.upper()
@@ -380,14 +465,29 @@ def seq_to_1hot(seq, randomsel=True):
def convert2bed(variants_file, output_dir):
+ """
+ Convert a variants file to BED format.
+
+ Parameters:
+ - variants_file (str): The path to the input variants file.
+ - output_dir (str): The directory where the BED file will be saved.
+
+ Returns:
+ None
+
+ Notes:
+ - The input variants file should be in tab-separated format with columns: "#CHROM", "POS", "ID", "REF", "ALT".
+ - The generated BED file will have columns: "CHR", "Start", "End", "ID", "VAR", "Strand".
+ - The "Start" and "End" columns are set to the "POS" values, and "Strand" is set to '.' for all entries.
+ """
file_name = variants_file.split("/")[-1]
- logger.info(f"Generating BED file: {output_dir}/{file_name[:-3]}bed")
+ print(f"Generating BED file: {output_dir}/{file_name[:-3]}bed")
df_variants = pd.read_csv(
variants_file, sep="\t", names=["#CHROM", "POS", "ID", "REF", "ALT"]
- ) # hg38
+ )
- logger.debug(df_variants.head())
+ print(df_variants.head())
df_bed = pd.DataFrame()
df_bed["CHR"] = df_variants["#CHROM"].astype(str)
@@ -403,6 +503,27 @@ def convert2bed(variants_file, output_dir):
def deepripe_encode_variant_bedline(bedline, genomefasta, flank_size=75):
+ """
+ Encode a variant bedline into one-hot encoded sequences.
+
+ Parameters:
+ - bedline (list): A list representing a variant bedline, containing elements for chromosome, start position, end position, reference allele, alternate allele, and strand.
+ - genomefasta (str): The path to the genome FASTA file for sequence retrieval.
+ - flank_size (int): The size of flanking regions to include in the sequence around the variant position.
+
+ Returns:
+ numpy.ndarray: A 3D array representing one-hot encoded sequences. The dimensions are (num_sequences, sequence_length, nucleotide_channels).
+
+ Notes:
+ - The input bedline should follow the format: [chromosome, start position, end position, reference allele, alternate allele, strand].
+ - The function retrieves the wild-type and mutant sequences flanked by the specified size.
+ - The wild-type sequence is extracted from the genome FASTA file and mutated at the variant position.
+ - The resulting sequences are one-hot encoded and returned as a numpy array.
+
+ References:
+ - pybedtools.BedTool: https://daler.github.io/pybedtools/main.html
+    - FASTA format: https://en.wikipedia.org/wiki/FASTA_format
+ """
mut_a = bedline[4].split("/")[1]
strand = bedline[5]
if len(mut_a) == 1:
@@ -440,39 +561,22 @@ def deepripe_encode_variant_bedline(bedline, genomefasta, flank_size=75):
return encoded_seqs
-def deepripe_score_variant_onlyseq_all(
- model_group, variant_bed, genomefasta, seq_len=200, n_jobs=32
-):
- predictions = {}
- encoded_seqs_list = Parallel(n_jobs=n_jobs, verbose=10)(
- delayed(deepripe_encode_variant_bedline)(
- bedline, genomefasta, flank_size=(seq_len // 2) + 2
- )
- for bedline in variant_bed
- )
- encoded_seqs_list = [
- (x if x is not None else np.ones((2, seq_len + 4, 4)) * float("nan"))
- for x in encoded_seqs_list
- ]
- encoded_seqs = tf.concat(encoded_seqs_list, 0)
+def readYamlColumns(annotation_columns_yaml_file):
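+    """
+    Read the annotation column-name mapping from a YAML file.
+
+    The YAML file is expected to contain an 'annotation_column_names' mapping of the form
+    {original_column_name: {new_column_name: fill_value}}.
+
+    Parameters:
+    - annotation_columns_yaml_file (str): Path to the YAML file.
+
+    Returns:
+    tuple: (prior_names, post_names, fill_vals, column_name_mapping, fill_value_mapping), where
+        column_name_mapping maps original to new column names and fill_value_mapping maps new
+        column names to their fill values.
+    """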
+ with open(annotation_columns_yaml_file, "r") as fd:
+ config = yaml.safe_load(fd)
+ columns = config["annotation_column_names"]
+ prior_names = list(columns.keys())
+ post_names = [list(columns[k].keys())[0] for k in columns]
+ fill_vals = [list(columns[k].values())[0] for k in columns]
+ column_name_mapping = dict(zip(prior_names, post_names))
+ fill_value_mapping = dict(zip(post_names, fill_vals))
+ return prior_names, post_names, fill_vals, column_name_mapping, fill_value_mapping
- logger.info("Computing predictions")
- ## shifting around (seq_len+4) 4 bases
- for choice in tqdm(model_group.keys(), desc="Model group"):
- avg_score = 0.0
- for i in range(4):
- cropped_seqs = encoded_seqs[:, i : i + seq_len, :]
- model, _ = model_group[choice]
- pred = model.predict_on_batch(cropped_seqs)
- wild_indices = tf.range(pred.shape[0], delta=2)
- mut_indices = tf.range(1, pred.shape[0], delta=2)
- pred_wild = pred[wild_indices, :]
- pred_mut = pred[mut_indices, :]
- score = pred_mut - pred_wild
- avg_score += score
- predictions[choice] = avg_score / 4
- return predictions
+def get_parquet_columns(parquet_file):
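+    """
+    Return the column names of a parquet file using fastparquet.
+
+    Parameters:
+    - parquet_file (str): Path to the parquet file.
+
+    Returns:
+    list: Column names of the parquet file.
+    """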
+ pfile = ParquetFile(parquet_file)
+ pcols = pfile.columns
+ return pcols
@click.group()
@@ -481,22 +585,177 @@ def cli():
@cli.command()
-@click.option("--n-components", type=int, default=100)
+@click.argument("anno_path", type=click.Path(exists=True))
+@click.argument("gtf_path", type=click.Path(exists=True))
+@click.argument("genes_path", type=click.Path(exists=True))
+@click.argument("output_path", type=click.Path(exists=False))
+@click.option("--max_dist", type=int, default=300)
+def filter_annotations_by_exon_distance(
+ anno_path: str, gtf_path: str, genes_path: str, output_path: str, max_dist: int
+) -> None:
+ """
+    Filters annotations based on the distance to the nearest exon of the gene they are associated with.
+
+ Args:
+ anno_path (str): Annotation parquet file containing variant annotations to filter.
+ gtf_path (str): GTF file containing start and end positions of all relevant exons of all relevant genes. DataFrame is filtered for protein coding exons.
+ genes_path (str): List of protein coding genes and their IDs in the annotation DataFrame.
+ output_path (str): Where to write the resulting parquet file.
+        max_dist (int): Maximum distance in base pairs from an exon for a variant to be retained.
+
+ Returns:
+ None
+
+ Writes:
+ Parquet file containing filtered annotations.
+ """
+ import pyranges as pr
+
+ logger.info("read gtf file as pandas df")
+ gtf = pr.read_gtf(gtf_path)
+ gtf = gtf.as_df()
+
+ logger.info("filter gtf for protein coding exons from the HAVANA DB")
+ gtf = gtf.query(
+ "Source == 'HAVANA' and Feature == 'exon' and gene_type == 'protein_coding' and transcript_type == 'protein_coding'"
+ )
+
+ logger.info("split gene ID column on '.'")
+ gtf[["gene_base", "feature"]] = gtf["gene_id"].str.split(".", expand=True)
+
+ logger.info(" read protein_coding_genes")
+ pcg = pd.read_parquet(genes_path, columns=["gene", "id"])
+
+ logger.info(" only select necessary columns, rename to fit gtf file")
+ gtf = gtf[["gene_id", "Start", "End"]].rename(columns={"gene_id": "gene"})
+
+ logger.info(" add gene ids to gtf file")
+
+ gtf = gtf.merge(pcg, on="gene")
+
+ logger.info(" only select necessary columns, rename to fit gtf file")
+ gtf = gtf[["Start", "End", "id"]].rename(columns={"id": "gene_id"})
+
+ logger.info("reading annotations to filter ")
+ anno_df = pd.read_parquet(anno_path)
+ anno_df = anno_df[["id", "pos", "gene_id"]]
+
+ logger.info("adding exons to annotations (1:M merge)")
+
+ merged = anno_df.merge(gtf, how="left", on="gene_id")
+ del anno_df
+
+ logger.info(
+ "adding positons of start and end of each exon relative to variant position to df"
+ )
+ merged["start_diff"] = merged["Start"] - merged["pos"]
+ merged["end_diff"] = merged["End"] - merged["pos"]
+
+ logger.info(
+ f"filtering all rows that are further than {max_dist}bp away from each exon "
+ )
+ len_bf_filtering = len(merged)
+ filtered_merge = merged.query(
+ "(start_diff <= 0 & end_diff >= 0) | abs(start_diff) <= @max_dist | abs(end_diff) <= @max_dist"
+ )
+ del merged
+ len_after_filtering = len(filtered_merge)
+ logger.info(
+ f"filtered rows by exon distance ({max_dist}bp), dropped({len_bf_filtering - len_after_filtering} rows / {np.round(100*(len_bf_filtering - len_after_filtering)/len_bf_filtering)}%)"
+ )
+
+ logger.info("select necessary columns, drop duplicates")
+ filtered_merge = filtered_merge[["id", "gene_id"]]
+ filtered_merge = filtered_merge.drop_duplicates()
+ logger.info(
+ f"dropped dublicates in data frame (dropped {len_after_filtering - len(filtered_merge)}rows/ {np.round(100*(len_after_filtering - len(filtered_merge))/len_after_filtering)}%)."
+ )
+
+ logger.info("Reading in annotations for filtering")
+ anno_df = pd.read_parquet(anno_path)
+ len_anno = len(anno_df)
+ filtered = anno_df.merge(filtered_merge, on=["id", "gene_id"], how="left")
+
+ logger.info(
+ f"filtered annotations based on filterd id, gene_id (dropped {len(anno_df) - len(filtered)} / {np.round(100*(len(anno_df)-len(filtered))/len(anno_df))}% of rows)."
+ )
+ logger.info("performing sanity check")
+    assert len(filtered) == len_anno
+ logger.info(f"writing result to {output_path}")
+ filtered.to_parquet(output_path)
+
+
+@cli.command()
@click.argument("deepsea-file", type=click.Path(exists=True))
@click.argument("pca-object", type=click.Path())
+@click.argument("means_sd_df", type=click.Path())
@click.argument("out-dir", type=click.Path(exists=True))
-def deepsea_pca(n_components: int, deepsea_file: str, pca_object: str, out_dir: str):
+@click.option("--n-components", type=int, default=100)
+def deepsea_pca(
+ deepsea_file: str,
+ pca_object: str,
+ means_sd_df: str,
+ out_dir: str,
+ n_components: int,
+):
+ """
+ Perform Principal Component Analysis (PCA) on DeepSEA data and save the results.
+
+ Parameters:
+ - n_components (int): Number of principal components to retain, default is 100.
+ - deepsea_file (str): Path to the DeepSEA data in parquet format.
+ - pca_object (str): Path to save or load the PCA object (components) in npy or pickle format.
+    - means_sd_df (str): Path to a DataFrame containing pre-calculated means and SDs for standardization. If the file does not exist, the data are standardized with the mean and SD computed from the input, and the resulting DataFrame is saved to this path.
+ - out_dir (str): Path to the output directory where the PCA results will be saved.
+
+ Returns:
+ None
+
+ Raises:
+ AssertionError: If there are NaN values in the PCA results DataFrame.
+
+ Notes:
+ - If 'means_sd_df' is provided, the data will be standardized using the existing mean and SD. Otherwise, the data will be standardized using the mean and SD calculated from the data.
+ - If 'pca_object' exists, it will be loaded as a PCA object. If it doesn't exist, a new PCA object will be created, and its components will be saved to 'pca_object'.
+
+ Example:
+ $ python annotations.py deepsea_pca --n-components 50 deepsea_data.parquet pca_components.npy means_sd.parquet results/
+ """
logger.info("Loading deepSea data")
- df = pd.read_csv(deepsea_file)
+ df = pd.read_parquet(deepsea_file)
logger.info("filling NAs")
df = df.fillna(0)
logger.info("Extracting matrix for PCA")
- key_df = df[["chrom", "pos", "ref", "alt", "id"]].reset_index(drop=True)
+ key_df = df[["#CHROM", "POS", "REF", "ALT"]].reset_index(drop=True)
logger.info("transforming values to numpy")
- X = df[[c for c in df.columns if c.startswith("DeepSEA")]].to_numpy()
+ deepSEAcols = [c for c in df.columns if c.startswith("DeepSEA")]
+ X = df[deepSEAcols].to_numpy()
del df
- logger.info("standardizing values")
- X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
+ logger.info(
+ "checking wether input contains data frame with pre-calculated means and SDs"
+ )
+ if os.path.exists(means_sd_df):
+ logger.info("standardizing values using existing mean and SD")
+ means_sd_data = pd.read_parquet(means_sd_df)
+
+ means = means_sd_data["means"].to_numpy()
+ sds = means_sd_data["SDs"].to_numpy()
+ del means_sd_data
+ X_std = (X - means) / sds
+ del means
+ del sds
+
+ else:
+ logger.info("standardizing values")
+ X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
+ means_sd_data = pd.DataFrame(
+ {
+ "names": deepSEAcols,
+ "means": np.mean(X, axis=0),
+ "SDs": np.std(X, axis=0),
+ }
+ )
+ means_sd_data.to_parquet(means_sd_df)
del X
out_path = Path(out_dir)
@@ -504,7 +763,7 @@ def deepsea_pca(n_components: int, deepsea_file: str, pca_object: str, out_dir:
if os.path.exists(pca_object):
if ".pkl" in pca_object:
with open(pca_object, "rb") as pickle_file:
- logger.info("loading pca objects pickle file")
+ logger.info("loading pca objectas pickle file")
pca = pickle.load(pickle_file)
X_pca = pca.transform(X_std)
else:
@@ -559,6 +818,31 @@ def scorevariants_deepripe(
n_jobs: int,
saved_model_type: str = "parclip",
):
+ """
+ Score variants using deep learning models trained on PAR-CLIP and eCLIP data.
+
+ Parameters:
+ - variants_file (str): Path to the file containing variant information to be annotated.
+ - output_dir (str): Path to the output directory where the results will be saved.
+ - genomefasta (str): Path to the reference genome in FASTA format.
+ - pybedtools_tmp_dir (str): Path to the temporary directory for pybedtools.
+ - saved_deepripe_models_path (str): Path to the directory containing saved deepRiPe models.
+ - n_jobs (int): Number of parallel jobs for scoring variants.
+ - saved_model_type (str, optional): Type of the saved deepRiPe model to use (parclip, eclip_hg2, eclip_k5). Default is "parclip".
+
+ Returns:
+ None
+
+ Raises:
+ AssertionError: If there are NaN values in the generated DataFrame.
+
+ Notes:
+ - This function scores variants using deepRiPe models trained on different CLIP-seq datasets.
+ - The results are saved as a CSV file in the specified output directory.
+
+ Example:
+ $ python annotations.py scorevariants_deepripe variants.csv results/ reference.fasta tmp_dir/ saved_models/ 8 eclip_k5
+ """
file_name = variants_file.split("/")[-1]
bed_file = f"{output_dir}/{file_name[:-3]}bed"
@@ -582,7 +866,7 @@ def scorevariants_deepripe(
convert2bed(variants_file, output_dir)
variant_bed = pybedtools.BedTool(bed_file)
- logger.info(f"Scoring variants for: {bed_file}")
+ print(f"Scoring variants for: {bed_file}")
### paths for experiments
saved_models_dict = {
@@ -651,16 +935,16 @@ def scorevariants_deepripe(
)
for choice in current_model_type.keys():
- logger.debug(choice)
+ print(choice)
_, RBPnames = current_model_type[choice]
score_list = predictions[choice]
score_list = np.asarray(score_list)
- logger.info(f"Output size: {score_list.shape}")
+ print(f"Output size: {score_list.shape}")
### write predictions to df
for ix, RBP_name in enumerate(RBPnames):
df_variants[RBP_name] = score_list[:, ix]
- logger.info(
+ print(
f"saving file to: {output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv.gz"
)
df_variants.to_csv(
@@ -675,8 +959,32 @@ def process_chunk(
tissue_agg_function,
ca_shortened,
):
+ """
+ Process a chunk of data from absplice site results and merge it with the remaining annotation data.
+
+ Parameters:
+ - chrom_file (str): The filename for the chunk of absplice site results.
+ - abs_splice_res_dir (Path): The directory containing the absplice site results.
+ - tissues_to_exclude (list): List of tissues to exclude from the absplice site results.
+ - tissue_agg_function (str): The aggregation function to use for tissue-specific AbSplice scores.
+ - ca_shortened (DataFrame): The remaining annotation data to merge with the absplice site results.
+
+ Returns:
+ DataFrame: Merged DataFrame containing aggregated tissue-specific AbSplice scores and remaining annotation data.
+
+ Notes:
+ - The function reads the absplice site results for a specific chromosome, excludes specified tissues, and aggregates AbSplice scores using the specified tissue aggregation function.
+ - The resulting DataFrame is merged with the remaining annotation data based on the chromosome, position, reference allele, alternative allele, and gene ID.
+
+ Example:
+ merged_data = process_chunk("chr1_results.csv", Path("abs_splice_results/"), ["Brain", "Heart"], "max", ca_shortened_df)
+ """
logger.info(f"Reading file {chrom_file}")
- ab_splice_res = pd.read_csv(abs_splice_res_dir / chrom_file).reset_index()
+
+ ab_splice_res = pd.read_csv(
+ abs_splice_res_dir / chrom_file, engine="pyarrow"
+ ).reset_index()
+
ab_splice_res = ab_splice_res.query("tissue not in @tissues_to_exclude")
logger.info(
f"AbSplice tissues excluded: {tissues_to_exclude}, Aggregating AbSplice scores using {tissue_agg_function}"
@@ -706,24 +1014,43 @@ def process_chunk(
f"Number of unique variants(variant) in merged {len(merged['variant'].unique())}"
)
- del ab_splice_res
-
return merged
+
@cli.command()
@click.argument("current_annotation_file", type=click.Path(exists=True))
@click.argument("abs_splice_res_dir", type=click.Path(exists=True))
-@click.argument("out_file", type=click.Path())
@click.argument("absplice_score_file", type=click.Path())
@click.argument("njobs", type=int)
-def get_abscores(
+def aggregate_abscores(
current_annotation_file: str,
abs_splice_res_dir: str,
- out_file: str,
absplice_score_file: str,
njobs: int,
):
+ """
+ Aggregate AbSplice scores from AbSplice results and save the results.
+
+ Parameters:
+ - current_annotation_file (str): Path to the current annotation file in parquet format.
+ - abs_splice_res_dir (str): Path to the directory containing AbSplice results.
+ - absplice_score_file (str): Path to save the aggregated AbSplice scores in parquet format.
+ - njobs (int): Number of parallel jobs for processing AbSplice results.
+
+ Returns:
+ None
+
+ Notes:
+ - The function reads the current annotation file and extracts necessary information for merging.
+ - It then processes AbSplice results in parallel chunks, aggregating AbSplice scores.
+ - The aggregated scores are saved to the specified file.
+
+ Example:
+ $ python annotations.py aggregate_abscores annotations.parquet abs_splice_results/ absplice_scores.parquet 4
+ """
current_annotation_file = Path(current_annotation_file)
logger.info("reading current annotations file")
current_annotations = pd.read_parquet(current_annotation_file)
@@ -734,52 +1061,152 @@ def get_abscores(
current_annotations = current_annotations.rename(
columns={"AbSplice_DNA": "AbSplice_DNA_old"}
)
- ca_shortened = current_annotations[["id", "gene_id", "chrom", "pos", "ref", "alt"]]
+ ca_shortened = current_annotations[["id", "Gene", "chrom", "pos", "ref", "alt"]]
+ ca_shortened = ca_shortened.rename(columns={"Gene": "gene_id"})
logger.info(ca_shortened.columns)
abs_splice_res_dir = Path(abs_splice_res_dir)
tissue_agg_function = "max"
+ tissues_to_exclude = ["Testis"]
tissues_to_exclude = []
ab_splice_agg_score_file = absplice_score_file
- if not Path(ab_splice_agg_score_file).exists():
- logger.info("creating abSplice score file.. ")
-
- parallel = Parallel(n_jobs=njobs, return_as="generator")
- output_generator = parallel(
- delayed(process_chunk)(
- i,
- abs_splice_res_dir,
- tissues_to_exclude,
- tissue_agg_function,
- ca_shortened,
- )
- for i in tqdm(os.listdir(abs_splice_res_dir))
+ logger.info("creating abSplice score file.. ")
+ all_absplice_scores = []
+ parallel = Parallel(n_jobs=njobs, return_as="generator", verbose=50)
+ output_generator = parallel(
+ delayed(process_chunk)(
+ i, abs_splice_res_dir, tissues_to_exclude, tissue_agg_function, ca_shortened
+ )
+ for i in tqdm(os.listdir(abs_splice_res_dir))
+ )
+ all_absplice_scores = list(output_generator)
+
+ logger.info("concatenating files")
+ all_absplice_scores = pd.concat(all_absplice_scores)
+ logger.info(f"saving score file to {ab_splice_agg_score_file}")
+ all_absplice_scores.to_parquet(ab_splice_agg_score_file)
+
+
+logging.basicConfig(
+ format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
+ level="INFO",
+ stream=sys.stdout,
+)
+logger = logging.getLogger(__name__)
+
+
+def deepripe_score_variant_onlyseq_all(
+ model_group, variant_bed, genomefasta, seq_len=200, batch_size=1024, n_jobs=32
+):
+ """
+ Compute variant scores using a deep learning model for each specified variant.
+
+ Parameters:
+ - model_group (dict): A dictionary containing deep learning models for different choices. Each entry should be a key-value pair, where the key is the choice name and the value is a tuple containing the model and additional information.
+ - variant_bed (list): A list of variant bedlines, where each bedline represents a variant.
+ - genomefasta (str): Path to the reference genome in FASTA format.
+ - seq_len (int, optional): The length of the sequence to use around each variant. Default is 200.
+ - batch_size (int, optional): Batch size for parallelization. Default is 1024.
+ - n_jobs (int, optional): Number of parallel jobs for processing variant bedlines. Default is 32.
+
+ Returns:
+ dict: A dictionary containing variant scores for each choice in the model_group.
+ Each entry has the choice name as the key and the corresponding scores as the value.
+ """
+ predictions = {}
+
+ # Parallelize the encoding of variant bedlines
+ encoded_seqs_list = Parallel(n_jobs=n_jobs, verbose=10)(
+ delayed(deepripe_encode_variant_bedline)(
+ bedline, genomefasta, flank_size=(seq_len // 2) + 2
)
- all_absplice_scores = list(output_generator)
+ for bedline in variant_bed
+ )
- logger.info("concatenating files")
- all_absplice_scores = pd.concat(all_absplice_scores)
- logger.info(f"saving score file to {ab_splice_agg_score_file}")
- all_absplice_scores.to_parquet(ab_splice_agg_score_file)
+ # Handle cases where encoding is None
+ encoded_seqs_list = [
+ (x if x is not None else np.ones((2, seq_len + 4, 4)) * float("nan"))
+ for x in encoded_seqs_list
+ ]
+
+ # Concatenate encoded sequences
+ encoded_seqs = tf.concat(encoded_seqs_list, 0)
+ logger.info("Computing predictions")
+
+ # Compute predictions for each choice in the model group
+ for choice in tqdm(model_group.keys(), desc="Model group"):
+ avg_score = 0.0
+ for i in range(4):
+ cropped_seqs = encoded_seqs[:, i : i + seq_len, :]
+ model, _ = model_group[choice]
+ pred = model.predict_on_batch(cropped_seqs)
+ wild_indices = tf.range(pred.shape[0], delta=2)
+ mut_indices = tf.range(1, pred.shape[0], delta=2)
+ pred_wild = pred[wild_indices, :]
+ pred_mut = pred[mut_indices, :]
+ score = pred_mut - pred_wild
+ avg_score += score
+ predictions[choice] = avg_score / 4
+
+ return predictions
+
+
+def calculate_scores_max(scores):
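+    """
+    Return the maximum delta score from a pipe-separated prediction string (e.g. a SpliceAI_pred value).
+
+    Parameters:
+    - scores (str or None): Pipe-separated score string; fields 1 to 4 are parsed as floats.
+
+    Returns:
+    float or None: Maximum of the parsed values, np.NaN if no numeric values are present,
+        or None if 'scores' is None.
+    """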
+ if scores is None:
+ return None
else:
- logger.info("reading existing abSplice Score file")
- all_absplice_scores = pd.read_parquet(ab_splice_agg_score_file)
+        # Split the string on "|" and parse fields 1 to 4 as floats, skipping "nan" entries
+        values = [float(score) for score in scores.split("|")[1:5] if score != "nan"]
+        # Return the maximum of the parsed values
+ if len(values) > 0:
+ return np.max(values)
+ else:
+ return np.NaN
+
+
+@cli.command()
+@click.argument("current_annotation_file", type=click.Path(exists=True))
+@click.argument("absplice_score_file", type=click.Path())
+@click.argument("out_file", type=click.Path())
+def merge_abscores(
+ current_annotation_file: str,
+ absplice_score_file: str,
+ out_file: str,
+):
+ """
+ Merge AbSplice scores with the current annotation file and save the result.
+
+ Parameters:
+ - current_annotation_file (str): Path to the current annotation file in parquet format.
+ - absplice_score_file (str): Path to the AbSplice scores file in parquet format.
+ - out_file (str): Path to save the merged annotation file with AbSplice scores.
+
+ Returns:
+ None
+
+ Notes:
+ - The function reads AbSplice scores and the current annotation file.
+ - It merges the AbSplice scores with the current annotation file based on chromosome, position, reference allele, alternative allele, and gene ID.
+ - The merged file is saved with AbSplice scores.
+
+ Example:
+ $ python annotations.py merge_abscores current_annotation.parquet absplice_scores.parquet merged_annotations.parquet
+ """
+ all_absplice_scores = pd.read_parquet(absplice_score_file)
all_absplice_scores = all_absplice_scores[
- ["chrom", "pos", "ref", "alt", "gene_id", "AbSplice_DNA", "id"]
+ ["chrom", "pos", "ref", "alt", "gene_id", "AbSplice_DNA"]
]
annotations = pd.read_parquet(current_annotation_file, engine="pyarrow").drop(
columns=["AbSplice_DNA"], errors="ignore"
)
- annotations.drop_duplicates(
- inplace=True, subset=["chrom", "pos", "ref", "alt", "gene_id", "id"]
- )
-
+ annotations = annotations.rename(columns={"Gene": "gene_id"})
+ annotations.drop_duplicates(inplace=True, subset=["gene_id", "id"])
original_len = len(annotations)
logger.info("Merging")
@@ -788,12 +1215,17 @@ def get_abscores(
all_absplice_scores,
validate="1:1",
how="left",
- on=["chrom", "pos", "ref", "alt", "gene_id", "id"],
+ on=["chrom", "pos", "ref", "alt", "gene_id"],
)
logger.info("Sanity checking merge")
assert len(merged) == original_len
- assert len(merged[["gene_id", "id"]].drop_duplicates()) == len(merged)
+ logger.info(
+ f"len of merged after dropping duplicates: {len(merged.drop_duplicates(subset=['id', 'gene_id']))}"
+ )
+ logger.info(f"len of merged without dropping duplicates: {len(merged)}")
+
+ assert len(merged.drop_duplicates(subset=["id", "gene_id"])) == len(merged)
logger.info(
f'Filling {merged["AbSplice_DNA"].isna().sum()} '
@@ -810,53 +1242,6 @@ def get_abscores(
pd.options.mode.chained_assignment = None
-logging.basicConfig(
- format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
- level="INFO",
- stream=sys.stdout,
-)
-logger = logging.getLogger(__name__)
-
-
-@cli.command()
-@click.option("--n-components", type=int, default=59)
-@click.argument("deepripe-file", type=click.Path(exists=True))
-@click.argument("out-dir", type=click.Path(exists=True))
-def deepripe_pca(n_components: int, deepripe_file: str, out_dir: str):
- logger.info("Reading deepripe file")
- df = pd.read_csv(deepripe_file)
- df = df.drop(["Uploaded_variant"], axis=1)
- logger.debug(df.columns)
- df = df.dropna()
- key_df = df[["chrom", "pos", "ref", "alt", "id"]].reset_index(drop=True)
-
- logger.info("Extracting matrix for PCA")
- X = df[[c for c in df.columns if c not in key_df.columns]].to_numpy()
- del df
- logger.info("transforming columns to z scores")
- X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
- del X
-
- logger.info("Running PCA")
- pca = PCA(n_components=n_components)
- pca.fit(X_std)
- out_path = Path(out_dir)
- with open(out_path / "pca.pkl", "wb") as f:
- pickle.dump(pca, f)
-
- logger.info(f"Projecting rows to {n_components} PCs")
- X_pca = pca.transform(X_std)
- del X_std
- pca_df = pd.DataFrame(
- X_pca, columns=[f"DeepRipe_PC_{i}" for i in range(1, n_components + 1)]
- )
- del X_pca
- pca_df = pd.concat([key_df, pca_df], axis=1)
- pca_df.to_parquet(out_path / "deepripe_pca.parquet", engine="pyarrow")
-
- logger.info("Done")
-
-
@cli.command()
@click.argument("annotation_file", type=click.Path(exists=True))
@click.argument("deepripe_file", type=click.Path(exists=True))
@@ -865,6 +1250,27 @@ def deepripe_pca(n_components: int, deepripe_file: str, out_dir: str):
def merge_deepripe(
annotation_file: str, deepripe_file: str, out_file: str, column_prefix: str
):
+ """
+ Merge deepRiPe scores with an annotation file and save the result.
+
+ Parameters:
+ - annotation_file (str): Path to the annotation file in parquet format.
+ - deepripe_file (str): Path to the deepRiPe scores file in CSV format.
+ - out_file (str): Path to save the merged file with deepRiPe scores.
+ - column_prefix (str): Prefix to add to the deepRiPe score columns in the merged file.
+
+ Returns:
+ None
+
+ Notes:
+ - The function reads the annotation file and deepRiPe scores file.
+ - It renames the columns in the deepRiPe scores file with the specified prefix.
+ - The two dataframes are merged based on chromosome, position, reference allele, alternative allele, and variant ID.
+ - The merged file is saved with deepRiPe scores.
+
+ Example:
+ $ python annotations.py merge_deepripe annotations.parquet deepripe_scores.csv merged_deepripe.parquet deepripe
+ """
annotations = pd.read_parquet(annotation_file)
deepripe_df = pd.read_csv(deepripe_file)
orig_len = len(annotations)
@@ -888,18 +1294,69 @@ def merge_deepripe(
@cli.command()
@click.argument("annotation_file", type=click.Path(exists=True))
@click.argument("deepripe_pca_file", type=click.Path(exists=True))
+@click.argument("column_yaml_file", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
-def merge_deepsea_pcas(annotation_file: str, deepripe_pca_file: str, out_file: str):
- annotations = pd.read_parquet(annotation_file)
- deepripe_pcas = pd.read_parquet(deepripe_pca_file)
+def merge_deepsea_pcas(
+ annotation_file: str, deepripe_pca_file: str, column_yaml_file: str, out_file: str
+):
+ """
+ Merge deepRiPe PCA scores with an annotation file and save the result.
+
+ Parameters:
+ - annotation_file (str): Path to the annotation file in parquet format.
+ - deepripe_pca_file (str): Path to the deepRiPe PCA scores file in parquet format.
+    - column_yaml_file (str): Path to the YAML file listing all columns needed by the model, including their fill values.
+ - out_file (str): Path to save the merged file with deepRiPe PCA scores.
+
+ Returns:
+ None
+
+ Notes:
+ - The function reads the annotation file and deepRiPe PCA scores file.
+ - It drops duplicates in both files based on chromosome, position, reference allele, alternative allele, variant ID, and gene ID.
+ - The two dataframes are merged based on chromosome, position, reference allele, alternative allele, and variant ID.
+ - The merged file is saved with deepRiPe PCA scores.
+
+ Example:
+    $ python annotations.py merge_deepsea_pcas annotations.parquet deepripe_pca_scores.parquet column_config.yaml merged_deepsea_pcas.parquet
+ """
+
+ pcols = get_parquet_columns(deepripe_pca_file)
+ anno_cols = get_parquet_columns(annotation_file)
+ logger.info("reading current annotations")
+ prior_names, *_ = readYamlColumns(column_yaml_file)
+
+ DScommonCols = list(set(prior_names).intersection(set(pcols)))
+ AnnoCommonCols = list(set(prior_names).intersection(set(anno_cols)))
+ annotations = pd.read_parquet(
+ annotation_file,
+ columns=AnnoCommonCols + ["chrom", "pos", "ref", "alt", "id", "Gene"],
+ )
+ logger.info("reading PCAs")
+ deepripe_pcas = pd.read_parquet(
+ deepripe_pca_file, columns=DScommonCols + ["chrom", "pos", "ref", "alt", "id"]
+ )
+ deepripe_pcas = deepripe_pcas.drop_duplicates(
+ subset=["chrom", "pos", "ref", "alt", "id"]
+ )
orig_len = len(annotations)
+ logger.info(f"length of annotation file before merge: {orig_len}")
+ annotations = annotations.drop_duplicates(
+ subset=["chrom", "pos", "ref", "alt", "id", "Gene"]
+ )
+ noduplicates_len = len(annotations)
+ logger.info(
+ f"length of annotation file after dropping duplicates: {noduplicates_len}"
+ )
+ logger.info("merging")
merged = annotations.merge(
deepripe_pcas, how="left", on=["chrom", "pos", "ref", "alt", "id"]
)
- merged.rename(columns={"Gene": "gene_id"}, inplace=True)
-
- assert len(merged) == orig_len
+ logger.info(f"length of annotation file after merge: {len(merged)}")
+ logger.info("checking lengths")
+ assert len(merged) == noduplicates_len
+ logger.info(f"writing file to {out_file}")
merged.to_parquet(out_file)
@@ -907,6 +1364,26 @@ def merge_deepsea_pcas(annotation_file: str, deepripe_pca_file: str, out_file: s
@click.argument("in_variants", type=click.Path(exists=True))
@click.argument("out_variants", type=click.Path())
def process_annotations(in_variants: str, out_variants: str):
+ """
+ Process variant annotations, filter for canonical variants, and aggregate consequences.
+
+ Parameters:
+ - in_variants (str): Path to the input variant annotation file in parquet format.
+ - out_variants (str): Path to save the processed variant annotation file in parquet format.
+
+ Returns:
+ None
+
+ Notes:
+ - The function reads the input variant annotation file.
+ - It filters for canonical variants where the 'CANONICAL' column is equal to 'YES'.
+ - The 'Gene' column is renamed to 'gene_id'.
+ - Consequences for different alleles are aggregated by combining the variant ID with the gene ID.
+ - The processed variant annotations are saved to the specified output file.
+
+ Example:
+ $ python annotations.py process_annotations input_variants.parquet output_variants.parquet
+ """
variant_path = Path(in_variants)
variants = pd.read_parquet(variant_path)
@@ -919,10 +1396,36 @@ def process_annotations(in_variants: str, out_variants: str):
# combining variant id with gene id
variants["censequence_id"] = variants["id"].astype(str) + variants["gene_id"]
- variants.to_parquet(out_variants)
+ variants.to_parquet(out_variants, compression="zstd")
+
+
+def process_chunk_addids(chunk: pd.DataFrame, variants: pd.DataFrame) -> pd.DataFrame:
+ """
+ Process a chunk of data by adding identifiers from a variants dataframe.
+ Parameters:
+ - chunk (pd.DataFrame): Chunk of data containing variant information.
+ - variants (pd.DataFrame): Dataframe containing variant identifiers.
-def process_chunk_addids(chunk, variants):
+ Returns:
+ pd.DataFrame: Processed chunk with added variant identifiers.
+
+ Raises:
+ AssertionError: If the shape of the processed chunk does not match expectations.
+
+ Notes:
+ - The function renames columns for compatibility.
+ - Drops duplicates in the chunk based on the key columns.
+ - Merges the chunk with the variants dataframe based on the key columns.
+ - Performs assertions to ensure the shape of the processed chunk meets expectations.
+
+ Example:
+ ```python
+ chunk = pd.read_csv("chunk_data.csv")
+ variants = pd.read_csv("variants_data.csv")
+ processed_chunk = process_chunk_addids(chunk, variants)
+ ```
+ """
chunk = chunk.rename(
columns={
"#CHROM": "chrom",
@@ -934,8 +1437,10 @@ def process_chunk_addids(chunk, variants):
}
)
key_cols = ["chrom", "pos", "ref", "alt"]
+
chunk.drop_duplicates(subset=key_cols, inplace=True)
chunk_shape = chunk.shape
+
chunk = pd.merge(chunk, variants, on=key_cols, how="left", validate="1:1")
try:
@@ -961,56 +1466,149 @@ def process_chunk_addids(chunk, variants):
@click.argument("njobs", type=int)
@click.argument("out_file", type=click.Path())
def add_ids(annotation_file: str, variant_file: str, njobs: int, out_file: str):
- data = pd.read_csv(annotation_file, chunksize=10_000)
+ """
+ Add identifiers from a variant file to an annotation file and save the result.
+
+ Parameters:
+ - annotation_file (str): Path to the input annotation file in CSV format.
+ - variant_file (str): Path to the input variant file in TSV format.
+ - njobs (int): Number of parallel jobs to process the data.
+ - out_file (str): Path to save the processed data in Parquet format.
+
+ Returns:
+ None
+
+ Notes:
+ - The function reads the annotation file in chunks and the entire variant file.
+ - It uses parallel processing to apply the 'process_chunk_addids' function to each chunk.
+ - The result is saved in Parquet format.
+
+ Example:
+ $ python annotations.py add_ids annotation_data.csv variant_data.tsv 4 processed_data.parquet
+ """
+
+ data = pd.read_csv(annotation_file, chunksize=100_000)
all_variants = pd.read_csv(variant_file, sep="\t")
- parallel = Parallel(n_jobs=njobs, return_as="generator")
+ parallel = Parallel(n_jobs=njobs, return_as="generator", verbose=50)
output_generator = parallel(
delayed(process_chunk_addids)(chunk, all_variants) for chunk in data
)
- pd.concat([batch for batch in output_generator]).to_csv(out_file, index=False)
+ first = True
+ for batch in tqdm(output_generator):
+ if first:
+ batch.to_parquet(out_file, engine="fastparquet")
+ else:
+ batch.to_parquet(out_file, engine="fastparquet", append=True)
+ first = False
@cli.command()
-@click.option("--included-chromosomes", type=str)
-@click.option("--comment-lines", is_flag=True)
-@click.option("--sep", type=str, default=",")
-@click.argument("annotation_dir", type=click.Path(exists=True))
-@click.argument("deepripe_name_pattern", type=str)
-@click.argument("pvcf-blocks_file", type=click.Path(exists=True))
+@click.argument("annotation_file", type=click.Path(exists=True))
+@click.argument("variant_file", type=click.Path(exists=True))
+@click.argument("njobs", type=int)
@click.argument("out_file", type=click.Path())
-def concatenate_deepripe(
- included_chromosomes: Optional[str],
- sep: str,
- comment_lines: bool,
- annotation_dir: str,
- deepripe_name_pattern: str,
- pvcf_blocks_file: str,
+def add_ids_dask(annotation_file: str, variant_file: str, njobs: int, out_file: str):
+ """
+ Add identifiers from a variant file to an annotation file using Dask and save the result.
+
+ Parameters:
+ - annotation_file (str): Path to the input annotation file in Parquet format.
+ - variant_file (str): Path to the input variant file in Parquet format.
+ - njobs (int): Number of parallel jobs to process the data.
+ - out_file (str): Path to save the processed data in Parquet format.
+
+ Returns:
+ None
+
+ Notes:
+    - The function reads the annotation file with Dask (large block size) and the variant file with pandas.
+ - It renames columns for compatibility and drops duplicates based on key columns.
+ - Merges the Dask dataframes using the 'merge' function.
+ - The result is saved in Parquet format with compression.
+
+ Example:
+ $ python annotations.py add_ids_dask annotation_data.parquet variant_data.parquet 4 processed_data.parquet
+ """
+ data = dd.read_parquet(annotation_file, blocksize=25e9)
+ all_variants = pd.read_table(variant_file)
+ data = data.rename(
+ columns={
+ "#CHROM": "chrom",
+ "POS": "pos",
+ "ID": "variant_name",
+ "REF": "ref",
+ "ALT": "alt",
+ "chr": "chrom",
+ }
+ )
+ key_cols = ["chrom", "pos", "ref", "alt"]
+ data.drop_duplicates(subset=key_cols, inplace=True)
+ data = dd.merge(data, all_variants, on=key_cols, how="left")
+ data.to_parquet(out_file, engine="fastparquet", compression="zstd")
+
+
+def chunks(lst, n):
+ """
+ Split a list into chunks of size 'n'.
+
+ Parameters:
+ - lst (list): The input list to be split into chunks.
+ - n (int): The size of each chunk.
+
+ Yields:
+ list: A chunk of the input list.
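+
+    Example:
+        list(chunks([1, 2, 3, 4, 5], 2)) returns [[1, 2], [3, 4], [5]].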
+ """
+ for i in range(0, len(lst), n):
+ yield lst[i : i + n]
+
+
+def read_deepripe_file(f: str):
+ """
+ Read a DeepRipe file from the specified path.
+
+ Parameters:
+ - f (str): Path to the DeepRipe file.
+
+ Returns:
+ pd.DataFrame: DataFrame containing the data from the DeepRipe file.
+
+ Example:
+ ```python
+ file_path = "path/to/deepripe_file.txt"
+ deepripe_data = read_deepripe_file(file_path)
+ ```
+ """
+ f = pd.read_table(f, engine="c")
+ return f
+
+
+@cli.command()
+@click.argument("deepsea_files", type=str)
+@click.argument("out_file", type=click.Path())
+@click.argument("njobs", type=int)
+def concatenate_deepsea(
+ deepsea_files: str,
out_file: str,
+ njobs: int,
):
- annotation_dir = Path(annotation_dir)
+ """
+    Concatenate the provided DeepSEA prediction files into a single Parquet file.
- logger.info("Reading variant file")
+ Parameters:
+    - deepsea_files (str): Comma-separated list of deepSEA files to concatenate.
+ - out_file (str): Path to save the concatenated output file in Parquet format.
+ - njobs (int): Number of parallel jobs for processing.
- logger.info("reading pvcf block file")
- pvcf_blocks_df = pd.read_csv(
- pvcf_blocks_file,
- sep="\t",
- header=None,
- names=["Index", "Chromosome", "Block", "First position", "Last position"],
- dtype={"Chromosome": str},
- ).set_index("Index")
- if included_chromosomes is not None:
- included_chromosomes = [int(c) for c in included_chromosomes.split(",")]
- pvcf_blocks_df = pvcf_blocks_df[
- pvcf_blocks_df["Chromosome"].isin([str(c) for c in included_chromosomes])
- ]
- pvcf_blocks = zip(pvcf_blocks_df["Chromosome"], pvcf_blocks_df["Block"])
- file_paths = [
- annotation_dir / deepripe_name_pattern.format(chr=p[0], block=p[1])
- for p in pvcf_blocks
- ]
+ Returns:
+ None
+
+ Example:
+    $ python annotations.py concatenate_deepsea chr1_block0.CLI.deepseapredict.diff.tsv,chr1_block1.CLI.deepseapredict.diff.tsv,chr1_block2.CLI.deepseapredict.diff.tsv concatenated_output.parquet 4
+ """
+
+ file_paths = deepsea_files.split(",")
logger.info("check if out_file already exists")
if os.path.exists(out_file):
logger.info("file exists, removing existing file")
@@ -1019,16 +1617,28 @@ def concatenate_deepripe(
logger.info("out_file does not yet exist")
logger.info("reading in f")
- for f in tqdm(file_paths):
- if comment_lines:
- current_file = pd.read_csv(f, comment="#", sep=sep, low_memory=False)
- else:
- current_file = pd.read_csv(f, sep=sep, low_memory=False)
- if f == file_paths[0]:
+
+ parallel = Parallel(n_jobs=njobs, backend="loky", return_as="generator")
+ chunked_files = list(chunks(file_paths, njobs))
+ logger.info(f"processing {len(chunked_files)} files")
+ for chunk in tqdm(chunked_files):
+ logger.info(f"Chunk consist of {len(chunk)} files")
+ this_generator = parallel((delayed(read_deepripe_file)(f) for f in chunk))
+ current_file = pd.concat(list(this_generator))
+ if chunk == chunked_files[0]:
logger.info("creating new file")
- current_file.to_csv(out_file, mode="a", index=False)
+ current_file.to_parquet(out_file, engine="fastparquet")
else:
- current_file.to_csv(out_file, mode="a", index=False, header=False)
+ try:
+ current_file.to_parquet(out_file, engine="fastparquet", append=True)
+ except ValueError:
+ out_df_columns = pd.read_parquet(out_file, engine="fastparquet").columns
+
+ logger.error(
+ f"columns are not equal in saved/appending file: {[i for i in out_df_columns if i not in current_file.columns]} and {[i for i in current_file.columns if i not in out_df_columns]} "
+ )
+
+ raise ValueError
@cli.command()
@@ -1038,7 +1648,9 @@ def concatenate_deepripe(
@click.argument("deepripe_hg2_file", type=click.Path(exists=True))
@click.argument("deepripe_k5_file", type=click.Path(exists=True))
@click.argument("variant_file", type=click.Path(exists=True))
+@click.argument("vcf_file", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
+@click.option("--vepcols_to_retain", type=str)
def merge_annotations(
vep_header_line: int,
vep_file: str,
@@ -1046,42 +1658,61 @@ def merge_annotations(
deepripe_hg2_file: str,
deepripe_k5_file: str,
variant_file: str,
+ vcf_file: str,
out_file: str,
+ vepcols_to_retain: Optional[str],
):
+ """
+    Merge VEP, DeepRipe (parclip, hg2, k5), and variant files into one DataFrame and save the result as a parquet file.
+
+ Parameters:
+ - vep_header_line (int): Line number of the header line in the VEP output file.
+ - vep_file (str): Path to the VEP file.
+ - deepripe_parclip_file (str): Path to the DeepRipe parclip file.
+ - deepripe_hg2_file (str): Path to the DeepRipe hg2 file.
+ - deepripe_k5_file (str): Path to the DeepRipe k5 file.
+ - variant_file (str): Path to the variant file.
+    - vcf_file (str): Path to the VCF file containing chrom, pos, ref, and alt information.
+ - out_file (str): Path to save the merged output file in Parquet format.
+ - vepcols_to_retain (Optional[str]): Comma-separated list of additional VEP columns to retain.
+
+ Returns:
+ None
+
+ Example:
+ $ python annotations.py merge_annotations 1 vep_file.tsv deepripe_parclip.csv deepripe_hg2.csv deepripe_k5.csv variant_file.tsv merged_output.parquet --vepcols_to_retain="AlphaMissense,PolyPhen"
+ """
# load vep file
- vep_df = pd.read_csv(
- vep_file,
- header=vep_header_line,
- sep="\t",
- na_values="-",
+ vep_df = pd.read_csv(vep_file, header=vep_header_line, sep="\t", na_values="-")
+ if vepcols_to_retain is not None:
+ vepcols_to_retain = [c for c in vepcols_to_retain.split(",")]
+ vep_df = process_vep(
+ vep_file=vep_df, vcf_file=vcf_file, vepcols_to_retain=vepcols_to_retain
)
- vep_df = process_vep(vep_file=vep_df)
logger.info(f"vep_df shape is {vep_df.shape}")
- # load deepripe_parclip
+ logger.info("load deepripe_parclip")
+
deepripe_parclip_df = pd.read_csv(deepripe_parclip_file)
deepripe_parclip_df = process_deepripe(deepripe_parclip_df, "parclip")
- # load deepripe_k5
+ logger.info("load deepripe_k5")
+
deepripe_k5_df = pd.read_csv(deepripe_k5_file)
deepripe_k5_df = process_deepripe(deepripe_k5_df, "k5")
- # load deepripe_hg2
+ logger.info("load deepripe_hg2")
+
deepripe_hg2_df = pd.read_csv(deepripe_hg2_file)
deepripe_hg2_df = process_deepripe(deepripe_hg2_df, "hg2")
- # load variant_file
+ logger.info("load variant_file")
+
logger.info(f"reading in {variant_file}")
variants = pd.read_csv(variant_file, sep="\t")
- # If variants start with chr
- # TODO Check if this is always true
- variants["chrom"] = variants["chrom"].str.replace("chr", "")
-
- # merge vep to variants M:1
+ logger.info("merge vep to variants M:1")
ca = vep_df.merge(
variants, how="left", on=["chrom", "pos", "ref", "alt"], validate="m:1"
)
del vep_df
- # merge deepripe files to variants 1:1
- logger.info(ca.columns)
- logger.info(deepripe_parclip_df.columns)
+ logger.info("merge deepripe files to variants 1:1")
ca = ca.merge(
deepripe_parclip_df,
how="left",
@@ -1095,10 +1726,23 @@ def merge_annotations(
deepripe_hg2_df, how="left", on=["chrom", "pos", "ref", "alt"], validate="m:1"
)
- ca.to_parquet(out_file)
+ ca.to_parquet(out_file, compression="zstd")
+
+
+def process_deepripe(deepripe_df: pd.DataFrame, column_prefix: str) -> pd.DataFrame:
+ """
+ Process the DeepRipe DataFrame, rename columns and drop duplicates.
+ Parameters:
+ - deepripe_df (pd.DataFrame): DataFrame containing DeepRipe data.
+ - column_prefix (str): Prefix to be added to column names.
-def process_deepripe(deepripe_df: object, column_prefix: str) -> object:
+ Returns:
+ pd.DataFrame: Processed DeepRipe DataFrame.
+
+ Example:
+ deepripe_df = process_deepripe(deepripe_df, "parclip")
+ """
logger.info("renaming deepripe columns")
deepripe_df = deepripe_df.rename(columns={"chr": "chrom"})
@@ -1113,46 +1757,49 @@ def process_deepripe(deepripe_df: object, column_prefix: str) -> object:
return deepripe_df
-def process_vep(vep_file: object) -> object:
- vep_file[["chrom", "pos", "ref", "alt"]] = (
- vep_file["#Uploaded_variation"]
- .str.replace("_", ":")
- .str.replace("/", ":")
- .str.split(":", expand=True)
- )
-
- vep_file["pos"] = vep_file["pos"].astype(int)
- logger.debug(vep_file.columns)
-
- vep_str_cols = [
- "CDS_position",
- "Protein_position",
- "Amino_acids",
- "Codons",
- "SYMBOL",
- "SYMBOL_SOURCE",
- "HGNC_ID",
- "MANE_SELECT",
- "APPRIS",
- "CCDS",
- "ENSP",
- "SWISSPROT",
- "TREMBL",
- "UNIPARC",
- "UNIPROT_ISOFORM",
- "RefSeq",
- "SIFT",
- "PolyPhen",
- "INTRON",
- "DOMAINS",
- "HGVSp",
- "SpliceAI_pred",
+def process_vep(
+ vep_file: pd.DataFrame, vcf_file: str, vepcols_to_retain: list = []
+) -> pd.DataFrame:
+ """
+ Process the VEP DataFrame, extracting relevant columns and handling data types.
+
+ Parameters:
+    - vep_file (pd.DataFrame): DataFrame containing VEP data.
+    - vcf_file (str): Path to a tab-separated file providing chrom, pos, variant ID ('#Uploaded_variation'), ref, and alt for each variant.
+    - vepcols_to_retain (list, optional): List of additional columns to retain. Defaults to an empty list.
+
+ Returns:
+ pd.DataFrame: Processed VEP DataFrame.
+
+ Example:
+ vep_file = process_vep(vep_file, vepcols_to_retain=["additional_col1", "additional_col2"])
+ """
+ vcf_df = pd.read_table(
+ vcf_file, names=["chrom", "pos", "#Uploaded_variation", "ref", "alt"]
+ )
+ if "#Uploaded_variation" in vep_file.columns:
+ vep_file = vep_file.merge(vcf_df, on="#Uploaded_variation")
+
+ if "pos" in vep_file.columns:
+ vep_file["pos"] = vep_file["pos"].astype(int)
+
+ vep_file["chrom"] = vep_file["chrom"].apply(
+ lambda x: "{}{}".format("chr", x.split("chr")[-1])
+ )
+
+ str_cols = [
"STRAND",
"TSL",
"GENE_PHENO",
+ "CADD_PHRED",
+ "CADD_RAW",
+ "SpliceAI_pred",
+ "BIOTYPE",
+ "Gene",
]
+ str_cols_present = [i for i in str_cols if i in vep_file.columns]
+ vep_file[str_cols_present] = vep_file[str_cols_present].astype(str)
- vep_float_cols = [
+ float_vals = [
"DISTANCE",
"gnomADg_FIN_AF",
"AF",
@@ -1170,89 +1817,122 @@ def process_vep(vep_file: object) -> object:
"TSL",
"Condel",
]
+ float_vals_present = [i for i in float_vals if i in vep_file.columns]
+ vep_file[float_vals_present] = (
+ vep_file[float_vals_present].replace("-", "NaN").astype(float)
+ )
+
+ necessary_columns = (
+ [
+ "chrom",
+ "pos",
+ "ref",
+ "alt",
+ "Gene",
+ "gnomADe_NFE_AF",
+ "CADD_PHRED",
+ "CADD_RAW",
+ "Consequence",
+ "PrimateAI",
+ "Alpha_Missense",
+ "am_pathogenicity",
+ "AbSplice_DNA",
+ "PolyPhen",
+ "SIFT",
+ "SIFT_score",
+ "PolyPhen_score",
+ "UKB_AF",
+ "combined_UKB_NFE_AF",
+ "combined_UKB_NFE_AF_MB",
+ "gene_id",
+ "Condel",
+ ]
+ + str_cols
+ + float_vals
+ + (vepcols_to_retain or [])
+ )
+ necessary_columns_present = [i for i in necessary_columns if i in vep_file.columns]
+
+ vep_file = vep_file[list(set(necessary_columns_present))]
+
+ if "SpliceAI_pred" in vep_file.columns:
+ vep_file["SpliceAI_delta_score"] = vep_file["SpliceAI_pred"].apply(
+ calculate_scores_max
+ )
+
+ if "Consequence" in vep_file.columns:
+ dummies = (
+ vep_file["Consequence"].str.get_dummies(",").add_prefix("Consequence_")
+ )
+ else:
+ raise ValueError("'Consequence' column expected to be in VEP output")
all_consequences = [
- "Consequence_3_prime_UTR_variant" "Consequence_5_prime_UTR_variant",
- "Consequence_NMD_transcript_variant",
+ "Consequence_splice_acceptor_variant",
+ "Consequence_5_prime_UTR_variant",
"Consequence_TFBS_ablation",
- "Consequence_TF_binding_site_variant",
- "Consequence_coding_sequence_variant",
- "Consequence_downstream_gene_variant",
- "Consequence_frameshift_variant",
+ "Consequence_start_lost",
"Consequence_incomplete_terminal_codon_variant",
- "Consequence_inframe_deletion",
- "Consequence_inframe_insertion",
- "Consequence_intergenic_variant",
"Consequence_intron_variant",
- "Consequence_mature_miRNA_variant",
- "Consequence_missense_variant",
- "Consequence_non_coding_transcript_exon_variant",
- "Consequence_non_coding_transcript_variant",
- "Consequence_protein_altering_variant",
- "Consequence_regulatory_region_variant",
- "Consequence_splice_acceptor_variant",
+ "Consequence_stop_gained",
"Consequence_splice_donor_5th_base_variant",
- "Consequence_splice_donor_region_variant",
+ "Consequence_downstream_gene_variant",
+ "Consequence_intergenic_variant",
"Consequence_splice_donor_variant",
+ "Consequence_NMD_transcript_variant",
+ "Consequence_protein_altering_variant",
"Consequence_splice_polypyrimidine_tract_variant",
- "Consequence_splice_region_variant",
- "Consequence_start_lost",
- "Consequence_start_retained_variant",
- "Consequence_stop_gained",
+ "Consequence_inframe_insertion",
+ "Consequence_mature_miRNA_variant",
+ "Consequence_synonymous_variant",
+ "Consequence_regulatory_region_variant",
+ "Consequence_non_coding_transcript_exon_variant",
"Consequence_stop_lost",
+ "Consequence_TF_binding_site_variant",
+ "Consequence_splice_donor_region_variant",
"Consequence_stop_retained_variant",
- "Consequence_synonymous_variant",
+ "Consequence_splice_region_variant",
+ "Consequence_coding_sequence_variant",
"Consequence_upstream_gene_variant",
+ "Consequence_frameshift_variant",
+ "Consequence_start_retained_variant",
+ "Consequence_3_prime_UTR_variant",
+ "Consequence_inframe_deletion",
+ "Consequence_missense_variant",
+ "Consequence_non_coding_transcript_variant",
]
-
- dummies = vep_file["Consequence"].str.get_dummies(",").add_prefix("Consequence_")
+ all_consequences = list(set(all_consequences))
mask = pd.DataFrame(
data=np.zeros(shape=(len(vep_file), len(all_consequences))),
columns=all_consequences,
- dtype="Int8",
+ dtype=float,
)
-
mask[list(dummies.columns)] = dummies
vep_file[mask.columns] = mask
- vep_file[vep_str_cols] = vep_file[vep_str_cols].astype(str)
- vep_file[vep_float_cols] = vep_file[vep_float_cols].astype(float)
- vep_file[all_consequences] = vep_file[all_consequences].astype("Int8")
-
return vep_file
@cli.command()
-@click.argument("pvcf_blocks_file", type=click.Path(exists=True))
-@click.argument("annotation_dir", type=click.Path(exists=True))
-@click.argument("filename_pattern", type=str)
+@click.argument("filenames", type=str)
@click.argument("out_file", type=click.Path())
-@click.option("--included-chromosomes", type=str)
def concat_annotations(
- pvcf_blocks_file: str,
- annotation_dir: str,
- filename_pattern: str,
+ filenames: str,
out_file: str,
- included_chromosomes: Optional[str],
):
- logger.info("reading pvcf block file")
- pvcf_blocks_df = pd.read_csv(
- pvcf_blocks_file,
- sep="\t",
- header=None,
- names=["Index", "Chromosome", "Block", "First position", "Last position"],
- dtype={"Chromosome": str},
- ).set_index("Index")
- if included_chromosomes is not None:
- included_chromosomes = [int(c) for c in included_chromosomes.split(",")]
- pvcf_blocks_df = pvcf_blocks_df[
- pvcf_blocks_df["Chromosome"].isin([str(c) for c in included_chromosomes])
- ]
- pvcf_blocks = zip(pvcf_blocks_df["Chromosome"], pvcf_blocks_df["Block"])
- annotation_dir = Path(annotation_dir)
- file_paths = [
- annotation_dir / filename_pattern.format(chr=p[0], block=p[1])
- for p in pvcf_blocks
- ]
+ """
+ Concatenate multiple annotation files based on the specified pattern and create a single output file.
+
+ Parameters:
+ - filenames (str): File paths for annotation files to concatenate
+ - out_file (str): Output file path.
+
+ Returns:
+ None
+
+ Example:
+ concat_annotations "annotations/chr1_block0_merged.parquet,annotations/chr1_block1_merged.parquet,annotations/chr1_block2_merged.parquet " "output.parquet")
+ """
+ file_paths = filenames.split(",")
for f in tqdm(file_paths):
logger.info(f"processing file {f}")
file = pd.read_parquet(f)
@@ -1275,5 +1955,165 @@ def concat_annotations(
raise ValueError
+@cli.command()
+@click.argument("genotype_file", type=click.Path(exists=True))
+@click.argument("variants_filepath", type=click.Path(exists=True))
+@click.argument("out_file", type=click.Path())
+def get_af_from_gt(genotype_file: str, variants_filepath: str, out_file: str):
+ """
+ Compute allele frequencies from genotype data.
+
+ Parameters:
+ - genotype_file (str): Path to the genotype file.
+ - variants_filepath (str): Path to the variants file.
+ - out_file (str): Output file path for storing allele frequencies.
+ """
+ import h5py
+
+ variants = pd.read_table(variants_filepath)
+ max_variant_id = variants["id"].max()
+
+ logger.info("Computing allele frequencies")
+ variant_counts = np.zeros(max_variant_id + 1)
+ with h5py.File(genotype_file, "r") as f:
+ variant_matrix = f["variant_matrix"]
+ genotype_matrix = f["genotype_matrix"]
+ n_samples = variant_matrix.shape[0]
+ for i in trange(n_samples):
+ variants = variant_matrix[i]
+ mask = variants > 0
+ variants = variants[mask]
+ genotype = genotype_matrix[i]
+ genotype = genotype[mask]
+ variant_counts[variants] += genotype
+
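+    # each diploid sample contributes two alleles, so AF = summed allele count / (2 * n_samples)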
+ af = variant_counts / (2 * n_samples)
+ af_df = pd.DataFrame({"id": np.arange(max_variant_id + 1), "af": af})
+ af_df.to_parquet(out_file)
+
+
+@cli.command()
+@click.argument("annotations_path", type=click.Path(exists=True))
+@click.argument("af_df_path", type=click.Path(exists=True))
+@click.argument("out_file", type=click.Path())
+def merge_af(annotations_path: str, af_df_path: str, out_file: str):
+ """
+ Merge allele frequency data into annotations and save to a file.
+
+ Parameters:
+ - annotations_path (str): Path to the annotations file.
+ - af_df_path (str): Path to the allele frequency DataFrame file.
+ - out_file (str): Path to the output file to save merged data.
+ """
+ annotations_df = pd.read_parquet(annotations_path)
+ af_df = pd.read_parquet(af_df_path)
+ merged_df = annotations_df.merge(af_df, how="left", on="id")
+ merged_df.to_parquet(out_file)
+
+
+@cli.command()
+@click.argument("annotations_path", type=click.Path(exists=True))
+@click.argument("out_file", type=click.Path())
+def calculate_maf(annotations_path: str, out_file: str):
+ """
+ Calculate minor allele frequency (MAF) from allele frequency data in annotations.
+
+ Parameters:
+ - annotations_path (str): Path to the annotations file containing allele frequency data.
+ - out_file (str): Path to the output file to save the calculated MAF data.
+ """
+ annotation_file = pd.read_parquet(annotations_path)
+ af = annotation_file["af"]
+ annotation_file = annotation_file.drop(
+ columns=["UKB_AF_MB", "UKB_MAF"], errors="ignore"
+ )
+ annotation_file["maf"] = af.apply(lambda x: min(x, 1 - x))
+ annotation_file["maf_mb"] = (af * (1 - af) + 1e-8) ** (-0.5)
+ annotation_file.to_parquet(out_file)
+
+
+@cli.command()
+@click.argument("protein_id_file", type=click.Path(exists=True))
+@click.argument("annotations_path", type=click.Path(exists=True))
+@click.argument("out_file", type=click.Path())
+def add_protein_ids(protein_id_file: str, annotations_path: str, out_file: str):
+ """
+ Add protein IDs to the annotations based on protein ID mapping file.
+
+ Parameters:
+ - protein_id_file (str): Path to the protein ID mapping file.
+ - annotations_path (str): Path to the annotations file.
+ - out_file (str): Path to the output file to save the annotations with protein IDs.
+ """
+ genes = pd.read_parquet(protein_id_file)
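+    # split versioned Ensembl-style gene IDs (e.g. "ENSGXXXXXXXXXXX.5", schematic) into base ID and version suffix; only the base ID is kept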
+ genes[["gene_base", "feature"]] = genes["gene"].str.split(".", expand=True)
+ genes.drop(columns=["feature", "gene", "gene_name", "gene_type"], inplace=True)
+ genes.rename(columns={"id": "gene_id"}, inplace=True)
+ annotations = pd.read_parquet(annotations_path)
+ len_anno = len(annotations)
+ annotations.rename(columns={"gene_id": "gene_base"}, inplace=True)
+ merged = annotations.merge(genes, on=["gene_base"], how="left")
+ assert len(merged) == len_anno
+ merged.to_parquet(out_file)
+
+
+@cli.command()
+@click.argument("gtf_filepath", type=click.Path(exists=True))
+@click.argument("out_file", type=click.Path())
+def create_protein_id_file(gtf_filepath: str, out_file: str):
+ """
+ Create a protein ID mapping file from the GTF file.
+
+ Parameters:
+ - gtf_filepath (str): Path to the GTF file.
+ - out_file (str): Path to save the output protein ID mapping file.
+ """
+ import pyranges as pr
+
+ gtf = pr.read_gtf(gtf_filepath)
+ gtf = gtf.as_df()
+ gtf = gtf.query(
+ "Feature =='gene' and gene_type=='protein_coding' and Source=='HAVANA'"
+ )
+ gtf = (
+ gtf[["gene_id", "gene_type", "gene_name"]]
+ .reset_index(drop=True)
+ .reset_index()
+ .rename(columns={"gene_id": "gene", "index": "id"})
+ )
+ gtf.to_parquet(out_file)
+
+
+@cli.command()
+@click.argument("annotation_columns_yaml_file", type=click.Path(exists=True))
+@click.argument("annotations_path", type=click.Path(exists=True))
+@click.argument("out_file", type=click.Path())
+def select_rename_fill_annotations(
+ annotation_columns_yaml_file: str, annotations_path: str, out_file: str
+):
+ """
+ Select, rename, and fill missing values in annotation columns based on a YAML configuration file.
+
+ Parameters:
+ - annotation_columns_yaml_file (str): Path to the YAML file containing name and fill value mappings.
+ - annotations_path (str): Path to the annotations file.
+ - out_file (str): Path to save the modified annotations file.
+ """
+
+ logger.info(
+ f"reading in yaml file containing name and fill value mappings from {annotation_columns_yaml_file}"
+ )
+ prior_names, _, _, column_name_mapping, fill_value_mapping = readYamlColumns(
+ annotation_columns_yaml_file
+ )
+ key_cols = ["id", "gene_id"]
+ anno_df = pd.read_parquet(
+ annotations_path, columns=list(set(prior_names + key_cols))
+ )
+ anno_df.rename(columns=column_name_mapping, inplace=True)
+ anno_df.fillna(fill_value_mapping, inplace=True)
+ anno_df.to_parquet(out_file)
+
+
if __name__ == "__main__":
cli()
diff --git a/deeprvat/cv_utils.py b/deeprvat/cv_utils.py
new file mode 100644
index 00000000..68a285e5
--- /dev/null
+++ b/deeprvat/cv_utils.py
@@ -0,0 +1,215 @@
+import pandas as pd
+import yaml
+import os
+import sys
+from typing import Optional
+import re
+
+# import pickle
+import logging
+import click
+import copy
+import zarr
+import numpy as np
+from numcodecs import Blosc
+from pathlib import Path
+from deeprvat.utils import (
+ standardize_series,
+ my_quantile_transform,
+)
+
+logging.basicConfig(
+ format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
+ level="INFO",
+ stream=sys.stdout,
+)
+logger = logging.getLogger(__name__)
+DATA_SLOT_DICT = {
+ "deeprvat": ["data", "training_data"],
+ "seed_genes": ["data"],
+}
+
+module_folder_dict = {
+ "seed_genes": "baseline",
+ "deeprvat": "deeprvat",
+ "alternative_burdens": "alternative_burdens",
+}
+
+
+@click.group()
+def cli():
+ pass
+
+
+@cli.command()
+@click.option("--module", "-m", multiple=True)
+@click.option("--fold", type=int)
+@click.option("--fold-specific-baseline", is_flag=True)
+@click.option("--n-folds", type=int, default=5)
+@click.argument("input_config", type=click.Path(exists=True))
+@click.argument("out_path", type=click.Path(), default="./")
+def spread_config(
+ input_config, out_path, module, fold_specific_baseline, fold, n_folds
+):
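+    """
+    Write per-module copies of the config for one cross-validation fold.
+
+    For every module passed via --module, the template config is copied and the dataset
+    sample files are pointed to the fold-specific training samples
+    ({cv_path}/{n_folds}_fold/samples_train{fold}.pkl). For the deeprvat module, if
+    --fold-specific-baseline is set, the baseline result directories are additionally
+    rewritten to their cv_split{fold}/baseline subdirectories. Each resulting config is
+    written to {out_path}/{module folder}/config.yaml.
+    """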
+ data_modules = module
+
+ with open(input_config) as f:
+ config_template = yaml.safe_load(f)
+ split = "train"
+ cv_path = f"{config_template['cv_path']}/{n_folds}_fold"
+ for module in data_modules:
+ config = copy.deepcopy(config_template)
+ data_slots = DATA_SLOT_DICT[module]
+ for data_slot in data_slots:
+ sample_file = f"{cv_path}/samples_{split}{fold}.pkl"
+ logger.info(f"setting sample file {sample_file}")
+ config[data_slot]["dataset_config"]["sample_file"] = sample_file
+
+        if module in ("deeprvat", "deeprvat_pretrained"):
+ logger.info("Writing baseline directories")
+ old_baseline = copy.deepcopy(config["baseline_results"])
+ if fold_specific_baseline:
+ config["baseline_results"] = [
+ {"base": f'{r["base"]}/cv_split{fold}/baseline', "type": r["type"]}
+ for r in old_baseline
+ ]
+ logger.info(config["baseline_results"])
+ logger.info(f"Writing config for module {module}")
+ with open(f"{out_path}/{module_folder_dict[module]}/config.yaml", "w") as f:
+ yaml.dump(config, f)
+
+
+@cli.command()
+@click.option("--fold", type=int)
+@click.option("--n-folds", type=int, default=5)
+@click.argument("input_config", type=click.Path(exists=True))
+@click.argument("out_file", type=click.Path())
+def generate_test_config(input_config, out_file, fold, n_folds):
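+    """
+    Write a copy of the config in which the DeepRVAT dataset sample files point to the
+    held-out test samples of the given cross-validation fold
+    ({cv_path}/{n_folds}_fold/samples_test{fold}.pkl).
+    """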
+ with open(input_config) as f:
+ config = yaml.safe_load(f)
+ cv_path = f"{config['cv_path']}/{n_folds}_fold"
+ split = "test"
+ sample_file = f"{cv_path}/samples_{split}{fold}.pkl"
+ logger.info(f"setting sample file {sample_file}")
+ for data_slot in DATA_SLOT_DICT["deeprvat"]:
+ config[data_slot]["dataset_config"]["sample_file"] = sample_file
+ with open(out_file, "w") as f:
+ yaml.dump(config, f)
+
+
+@cli.command()
+@click.option("--link-burdens", type=click.Path())
+@click.option("--burden-dirs", "-b", multiple=True)
+@click.argument("out_dir", type=click.Path(), default="./")
+@click.argument("config_file", type=click.Path(exists=True))
+def combine_test_set_burdens(
+ out_dir,
+ link_burdens,
+ burden_dirs,
+ config_file,
+):
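+    """
+    Combine per-split test-set results into single zarr arrays in out_dir: x.zarr, y.zarr
+    and, unless --link-burdens is given, burdens.zarr are concatenated along the sample
+    dimension across all --burden-dirs. Covariates are then re-standardized and the
+    configured y transformation (if any) is re-applied on the combined data. With
+    --link-burdens, out_dir/burdens.zarr is created as a symlink to the given path
+    instead of being copied.
+    """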
+ with open(config_file) as f:
+ config = yaml.safe_load(f)
+ compression_level = 1
+ skip_burdens = link_burdens is not None
+ n_total_samples = []
+ for burden_dir in burden_dirs:
+ print(burden_dir)
+ this_y = zarr.open(f"{burden_dir}/y.zarr")
+ this_x = zarr.open(f"{burden_dir}/x.zarr")
+ # this_burdens = zarr.open(f'{burden_dir}/burdens.zarr')
+
+ assert this_y.shape[0] == this_x.shape[0] # == this_burdens.shape[0]
+ n_total_samples.append(this_y.shape[0])
+
+ n_total_samples = np.sum(n_total_samples)
+ print(f"Total number of samples {n_total_samples}")
+ if not skip_burdens:
+        this_burdens = zarr.open(
+            f"{burden_dir}/burdens.zarr"
+        )  # any burden tensor (here the last one from the loop above), only used to get the non-sample dimensions
+ burdens = zarr.open(
+ Path(out_dir) / "burdens.zarr",
+ mode="a",
+ shape=(n_total_samples,) + this_burdens.shape[1:],
+ chunks=(1000, 1000),
+ dtype=np.float32,
+ compressor=Blosc(clevel=compression_level),
+ )
+ print(f"burdens shape: {burdens.shape}")
+ else:
+ burdens = None
+
+ y = zarr.open(
+ Path(out_dir) / "y.zarr",
+ mode="a",
+ shape=(n_total_samples,) + this_y.shape[1:],
+ chunks=(None, None),
+ dtype=np.float32,
+ compressor=Blosc(clevel=compression_level),
+ )
+ x = zarr.open(
+ Path(out_dir) / "x.zarr",
+ mode="a",
+ shape=(n_total_samples,) + this_x.shape[1:],
+ chunks=(None, None),
+ dtype=np.float32,
+ compressor=Blosc(clevel=compression_level),
+ )
+
+ start_idx = 0
+
+ for burden_dir in burden_dirs:
+ this_y = zarr.open(f"{burden_dir}/y.zarr")[:]
+ end_idx = start_idx + this_y.shape[0]
+ this_x = zarr.open(f"{burden_dir}/x.zarr")[:]
+ if not skip_burdens:
+ logger.info("writing burdens")
+ this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")[:]
+ burdens[start_idx:end_idx] = this_burdens
+ print((start_idx, end_idx))
+ y[start_idx:end_idx] = this_y
+ x[start_idx:end_idx] = this_x
+ start_idx = end_idx
+
+ y_transformation = config["data"]["dataset_config"].get("y_transformation", None)
+ standardize_xpheno = config["data"]["dataset_config"].get(
+ "standardize_xpheno", True
+ )
+
+    ## Analogous to what is done in dense_gt.py
+ if standardize_xpheno:
+ this_x = x[:]
+ logger.info(" Standardizing combined covariates")
+ for col in range(this_x.shape[1]):
+ this_x[:, col] = standardize_series(this_x[:, col])
+ x[:] = this_x
+ if y_transformation is not None:
+ this_y = y[:]
+ n_unique_y_val = np.count_nonzero(~np.isnan(np.unique(this_y)))
+ if n_unique_y_val == 2:
+ logger.warning(
+ "Not applying y transformation because y only has two values and seems to be binary"
+ )
+ y_transformation = None
+ if y_transformation is not None:
+ if y_transformation == "standardize":
+ logger.info(" Standardizing combined target phenotype (y)")
+ for col in range(this_y.shape[1]):
+ this_y[:, col] = standardize_series(this_y[:, col])
+ elif y_transformation == "quantile_transform":
+ logger.info(f" Quantile transforming combined target phenotype (y)")
+ for col in range(this_y.shape[1]):
+ this_y[:, col] = my_quantile_transform(this_y[:, col])
+ y[:] = this_y
+ print("done")
+ if link_burdens is not None:
+ source_path = Path(out_dir) / "burdens.zarr"
+ source_path.unlink(missing_ok=True)
+ source_path.symlink_to(link_burdens)
+ genes = np.load(f"{burden_dirs[0]}/genes.npy")
+ np.save(Path(out_dir) / "genes.npy", genes)
+
+
+if __name__ == "__main__":
+ cli()
diff --git a/deeprvat/data/dense_gt.py b/deeprvat/data/dense_gt.py
index c02521d5..12247518 100644
--- a/deeprvat/data/dense_gt.py
+++ b/deeprvat/data/dense_gt.py
@@ -32,6 +32,31 @@
AGGREGATIONS = {"max": np.max, "sum": np.sum}
+## Move this to utils?
+def get_matched_sample_indices(x, y):
+ """
+ # this function is supposed to do the same as
+ # indices= np.array([np.where(x==iy)[0][0] for iy in y]) but is much faster
+ #https://stackoverflow.com/questions/8251541/numpy-for-every-element-in-one-array-find-the-index-in-another-array
+
+ Args:
+ x : query array
+ y: query values. The function returns the index of each element of y in x
+ Returns:
+ np.array: Index of each element of y in x
+ """
+ assert np.in1d(y, x).sum() == len(y), "All values of y must be in x"
+
+ xsorted = np.argsort(x)
+ ypos = np.searchsorted(x[xsorted], y)
+ x_indices = xsorted[ypos]
+
+ x_mask = np.zeros(np.shape(x)).astype(bool)
+ x_mask[x_indices] = True
+
+ return x_indices, x_mask
+
+
class DenseGTDataset(Dataset):
def __init__(
self,
@@ -61,6 +86,7 @@ def __init__(
y_phenotypes: List[str] = [],
skip_y_na: bool = True,
skip_x_na: bool = False,
+ sample_file: str = None,
sim_phenotype_file: Optional[str] = None,
min_common_variant_count: Optional[int] = None,
min_common_af: Optional[Dict[str, float]] = None,
@@ -86,6 +112,7 @@ def __init__(
else:
logger.setLevel(logging.INFO)
+ self.check_samples = True # TODO undo
self.split = split
self.train_dataset = train_dataset
self.chromosomes = (
@@ -131,10 +158,7 @@ def __init__(
f"Using phenotype file {phenotype_file} and genotype file {self.gt_filename}"
)
self.setup_phenotypes(
- phenotype_file,
- sim_phenotype_file,
- skip_y_na,
- skip_x_na,
+ phenotype_file, sim_phenotype_file, skip_y_na, skip_x_na, sample_file
)
self.max_rare_af = max_rare_af
@@ -152,9 +176,9 @@ def __init__(
if grouping_level is not None:
if grouping_level == "gene":
- self.grouping_column = "gene_ids"
+ self.grouping_column = "gene_id"
elif grouping_level == "exon":
- self.grouping_column = "exon_ids"
+ self.grouping_column = "exon_id"
else:
raise ValueError(f"Unknown aggregation level {grouping_level}")
else:
@@ -202,15 +226,14 @@ def __getitem__(self, idx: int) -> torch.tensor:
gt_file = h5py.File(self.gt_filename, "r")
self.variant_matrix = gt_file["variant_matrix"]
self.genotype_matrix = gt_file["genotype_matrix"]
-
if self.cache_matrices:
self.variant_matrix = self.variant_matrix[:]
self.genotype_matrix = self.genotype_matrix[:]
- idx = self.index_map[idx]
-
- sparse_variants = self.variant_matrix[idx, :]
- sparse_genotype = self.genotype_matrix[idx, :]
+        # idx_pheno = self.index_map_pheno[idx]  # samples and phenotypes are already subset, so idx can be used directly
+ idx_geno = self.index_map_geno[idx]
+ sparse_variants = self.variant_matrix[idx_geno, :]
+ sparse_genotype = self.genotype_matrix[idx_geno, :]
(
common_variants,
all_sparse_variants,
@@ -221,7 +244,9 @@ def __getitem__(self, idx: int) -> torch.tensor:
idx, all_sparse_variants, sparse_genotype
)
- phenotypes = self.phenotype_df.iloc[idx, :]
+ phenotypes = self.phenotype_df.iloc[
+ idx, :
+ ] # TODO use loc here self.phenotype_df.loc[self.samples[idx]]
x_phenotype_tensor = torch.tensor(
phenotypes[self.x_phenotypes].to_numpy(dtype=np.float32), dtype=torch.float
@@ -230,7 +255,9 @@ def __getitem__(self, idx: int) -> torch.tensor:
y = torch.tensor(
phenotypes[self.y_phenotypes].to_numpy(dtype=np.float32), dtype=torch.float
)
-
+ if self.check_samples:
+ # sanity check, can be removed in future
+ assert self.samples_gt[idx_geno] == self.samples[idx]
return {
"sample": self.samples[idx],
"x_phenotypes": x_phenotype_tensor,
@@ -256,9 +283,17 @@ def setup_phenotypes(
sim_phenotype_file: Optional[str],
skip_y_na: bool,
skip_x_na: bool,
+ sample_file: Optional[str],
):
logger.debug("Reading phenotype dataframe")
self.phenotype_df = pd.read_parquet(phenotype_file, engine="pyarrow")
+ with h5py.File(self.gt_filename, "r") as f:
+ samples_gt = f["samples"][:]
+ samples_gt = np.array([item.decode("utf-8") for item in samples_gt])
+ if self.check_samples:
+ self.samples_gt = samples_gt
+ samples_phenotype_df = np.array(self.phenotype_df.index)
+        # phenotype_df may be ordered differently from samples_gt; this is reconciled below via index_map_geno
if sim_phenotype_file is not None:
logger.info(
f"Using phenotypes and covariates from simulated phenotype file {sim_phenotype_file}"
@@ -267,24 +302,76 @@ def setup_phenotypes(
self.phenotype_df = self.phenotype_df.join(
sim_phenotype
) # TODO on = , validate = "1:1"
+ if sample_file is not None:
+ logger.info(f"Using samples from sample file {sample_file}")
+ with open(sample_file, "rb") as f:
+ samples_to_keep = pickle.load(f)
+ samples_to_keep = np.array(samples_to_keep)
+ logger.info(f"Number of samples in sample file: {len(samples_to_keep)}")
+ shared_samples = np.array(
+ list(set(samples_to_keep).intersection(set(samples_phenotype_df)))
+ )
+ if len(shared_samples) < len(samples_to_keep):
+ logger.warning(
+ "Some samples from the sample file were not found in the data"
+ )
+            samples_to_keep = shared_samples
+ logger.info(
+ f"Number of samples in sample file and in phenotype_df: {len(samples_to_keep)}"
+ )
+ else:
+ logger.info("Using all samples in phenotype df")
+ samples_to_keep = copy.deepcopy(samples_phenotype_df)
+ logger.info("Removing samples that are not in genotype file")
+
+ samples_to_keep = np.array(
+ list(set(samples_to_keep).intersection(set(samples_gt)))
+ )
binary_cols = [
c for c in self.y_phenotypes if self.phenotype_df[c].dtype == bool
]
-
+ # samples_to_keep_mask = [
+ # True if i in samples_to_keep else False
+ # for i in self.phenotype_df.index
+ # ]
+ # much faster retrieval of the mask compared to commented out list operation above
+ samples_to_keep_df = (
+ pd.Series(samples_to_keep, name="sample").to_frame().assign(mask=True)
+ )
+ merged_mask = (
+ pd.Series(self.phenotype_df.index, name="sample")
+ .to_frame()
+ .merge(samples_to_keep_df, how="left", validate="1:1", on="sample")
+ )
+ samples_to_keep_mask = list(merged_mask["mask"].fillna(False))
+ assert sum(samples_to_keep_mask) == len(samples_to_keep)
mask_cols = copy.deepcopy(self.x_phenotypes)
if skip_y_na:
mask_cols += self.y_phenotypes
if skip_x_na:
mask_cols += self.x_phenotypes
mask = (self.phenotype_df[mask_cols].notna()).all(axis=1)
+ mask &= samples_to_keep_mask
+ samples_to_keep = self.phenotype_df.index[mask]
self.n_samples = mask.sum()
- logger.info(
- f"Number of samples with phenotype and covariates: {self.n_samples}"
- )
+ logger.info(f"Final number of kept samples: {self.n_samples}")
+ self.phenotype_df = self.phenotype_df[mask]
self.samples = self.phenotype_df.index.to_numpy()
- self.index_map = np.arange(len(self.phenotype_df))[mask]
+ # account for the fact that genotypes.h5 and phenotype_df can have different
+ # orders of their samples
+ self.index_map_geno, _ = get_matched_sample_indices(
+ samples_gt.astype(int), self.samples.astype(int)
+ )
+ # get_matched_sample_indices is a much, much faster implementation of the code below
+ # self.index_map_geno = [np.where(samples_gt.astype(int) == i) for i in self.samples.astype(int)]
+
+ if self.check_samples:
+ # just a sanity check for get_matched_sample_indices, can be removed in future
+ for i in np.random.choice(len(self.samples), 100):
+ # print(i)
+ assert self.samples[i] == samples_gt[self.index_map_geno[i]]
def get_variant_ids(self, matrix_indices: np.ndarray) -> np.ndarray:
return self.variant_id_map.loc[matrix_indices, "id"].to_numpy()
@@ -475,6 +562,9 @@ def setup_annotations(
annotation_file, columns=list(set(columns)), engine="pyarrow"
).compute()
self.annotation_df = self.annotation_df.set_index("id")
+ self.gene_specific_anno = self.annotation_df["gene_id"].dtype != np.dtype(
+ "O"
+ )
if type(annotation_aggregation) == str:
self.annotation_aggregation = AGGREGATIONS.get(
@@ -517,9 +607,14 @@ def setup_variants(
logger.debug(f' {mask.sum()} variants "common" by count filter')
elif min_common_af is not None:
af_col, af_threshold = list(min_common_af.items())[0]
+ af_annotation = self.annotation_df[[af_col]].reset_index()
+ af_annotation = af_annotation.drop_duplicates()
+ if not len(af_annotation["id"]) == len(af_annotation["id"].unique()):
+ raise ValueError(
+ "Annotation dataframe has inconsistent allele frequency values"
+ )
variants_with_af = safe_merge(
- variants[["id"]].reset_index(drop=True),
- self.annotation_df[[af_col]].reset_index(),
+ variants[["id"]].reset_index(drop=True), af_annotation
)
assert np.all(
variants_with_af["id"].to_numpy() == variants["id"].to_numpy()
@@ -562,26 +657,22 @@ def setup_variants(
if self.gene_file is not None:
genes = set(pd.read_parquet(self.gene_file, columns=["id"])["id"])
logger.debug(f" Retaining {len(genes)} genes from {self.gene_file}")
- variants_with_gene_ids = safe_merge(
- variants[["id"]].reset_index(drop=True),
- self.annotation_df[["gene_ids"]].reset_index(),
- )
- assert np.all(
- variants_with_gene_ids["id"].to_numpy() == variants["id"].to_numpy()
+ ids_to_keep = (
+ self.annotation_df.reset_index()[["id", "gene_id"]]
+ .explode("gene_id")
+ .query("gene_id in @genes")["id"]
+ .unique()
)
- additional_mask &= (
- variants_with_gene_ids["gene_ids"]
- .apply(lambda x: len(set(x) & genes) != 0)
- .to_numpy()
- )
- del variants_with_gene_ids
+ additional_mask &= variants["id"].isin(ids_to_keep).to_numpy()
if self.gene_types_to_keep is not None:
+ raise NotImplementedError
additional_mask &= (
variants["gene_types"]
.apply(lambda x: len(set(x) & self.gene_types_to_keep) != 0)
.to_numpy()
)
if self.ignore_by_annotation is not None:
+ raise NotImplementedError
for col, val in self.ignore_by_annotation:
if self.annotation_df[col].dtype == np.dtype("object"):
additional_mask &= (
@@ -601,9 +692,12 @@ def setup_variants(
and self.gene_file is None
and self.gene_types_to_keep is None
):
- rare_variant_mask &= (
- variants["gene_ids"].apply(lambda x: len(x) > 0).to_numpy()
- )
+ if self.gene_specific_anno:
+ rare_variant_mask &= variants["gene_id"].notna().to_numpy()
+ else:
+ rare_variant_mask &= (
+ variants["gene_ids"].apply(lambda x: len(x) > 0).to_numpy()
+ )
variants["rare_variant_mask"] = rare_variant_mask
@@ -613,6 +707,7 @@ def setup_variants(
common_variant_mask &= ~af_mask
common_variant_mask &= additional_mask
if self.group_common:
+ raise NotImplementedError
common_variant_mask &= (
variants["gene_ids"].apply(lambda x: len(x) > 0).to_numpy()
)
@@ -665,6 +760,8 @@ def get_variant_metadata(self, grouping_level: Optional[str]):
self.setup_common_groups()
def setup_common_groups(self):
+ raise NotImplementedError()
+
logger.debug("Setting up groups for common variants")
logger.debug(" Computing grouping")
common_variant_groups = self.variants.loc[
diff --git a/deeprvat/data/rare.py b/deeprvat/data/rare.py
index 40de8446..12f65d41 100644
--- a/deeprvat/data/rare.py
+++ b/deeprvat/data/rare.py
@@ -24,7 +24,7 @@
logger = logging.getLogger(__name__)
-# this class is used DeepRVAT
+# this class is used in DeepRVAT
class PaddedAnnotations:
def __init__(
self,
@@ -48,13 +48,17 @@ def __init__(
self.low_memory = low_memory
self.stand_params = None
self.skip_embedding = skip_embedding
+ self.gene_specific_anno = base_dataset.gene_specific_anno
if self.base_dataset.train_dataset is not None:
logger.debug("Setting up based on training dataset")
train_embedding = self.base_dataset.train_dataset.rare_embedding
self.annotation_df = train_embedding.annotation_df
- self.exploded_annotations = train_embedding.exploded_annotations
- self.exploded_annotations_np = self.exploded_annotations_np
+ if self.gene_specific_anno:
+                self.annotation_df_np = train_embedding.annotation_df_np
+            else:
+                self.exploded_annotations = train_embedding.exploded_annotations
+                self.exploded_annotations_np = train_embedding.exploded_annotations_np
self.gene_map = train_embedding.gene_map
self.genes = train_embedding.genes
self.genes_np = train_embedding.genes_np
@@ -84,7 +88,8 @@ def __init__(
if self.low_memory:
logger.info(f" Cleaning up to save memory")
self.annotation_df = None
- self.exploded_annotations = None
+ if not self.gene_specific_anno:
+ self.exploded_annotations = None
self.base_dataset.annotation_df = None
def embed(
@@ -114,7 +119,10 @@ def embed(
# logger.info(f"rows {rows}")
for i in rows:
gene = self.gene_map[self.genes_np[i]] # NOTE: Changed
- result[gene].append(self.exploded_annotations_np[i, :])
+ if self.gene_specific_anno:
+ result[gene].append(self.annotation_df_np[i, :])
+ else:
+ result[gene].append(self.exploded_annotations_np[i, :])
return result
@@ -164,10 +172,18 @@ def setup_annotations(
else set(thresholds.keys()) & set(annotation_df.columns)
)
mask = annotation_df.index.isin(rare_variant_ids)
- mask &= annotation_df[self.grouping_column].apply(lambda x: len(x) > 0)
- annotation_df = annotation_df.loc[
- mask, set(self.annotations + [self.grouping_column] + threshold_cols)
- ].copy()
+ if self.gene_specific_anno:
+ mask &= annotation_df[self.grouping_column].notna()
+ annotation_df = annotation_df.loc[
+ mask,
+ list(set(self.annotations + [self.grouping_column] + threshold_cols)),
+ ].copy()
+ else:
+ mask &= annotation_df[self.grouping_column].apply(lambda x: len(x) > 0)
+ annotation_df = annotation_df.loc[
+ mask, set(self.annotations + [self.grouping_column] + threshold_cols)
+ ].copy()
+
# standardize here
if (
self.base_dataset.standardize_rare_anno
@@ -203,94 +219,164 @@ def setup_annotations(
# return standardization params
- logger.debug(" Exploding annotations by groups")
- annotation_df[self.grouping_column] = annotation_df[self.grouping_column].apply(
- lambda x: list(set(list(x)))
- )
+ if self.gene_specific_anno:
+ annotation_df = annotation_df[
+ list(set([self.grouping_column] + self.annotations + threshold_cols))
+ ]
+ else:
+ logger.debug(" Exploding annotations by groups")
+ annotation_df[self.grouping_column] = annotation_df[
+ self.grouping_column
+ ].apply(lambda x: list(set(list(x))))
- exploded_annotations = annotation_df[
- set([self.grouping_column] + self.annotations + threshold_cols)
- ].explode(self.grouping_column)
+ exploded_annotations = annotation_df[
+ set([self.grouping_column] + self.annotations + threshold_cols)
+ ].explode(self.grouping_column)
if gene_file is not None:
logger.debug(" Filtering by provided genes")
genes_df = pd.read_parquet(gene_file, columns=["id", "gene"])
genes = set(genes_df["id"])
- mask = exploded_annotations["gene_ids"].isin(genes)
+ if self.gene_specific_anno:
+ mask = annotation_df["gene_id"].isin(genes)
+ else:
+ mask = exploded_annotations["gene_id"].isin(genes)
if genes_to_keep is not None:
genes_to_keep_ids = set(
genes_df[genes_df["gene"].isin(genes_to_keep)]["id"]
)
- mask &= exploded_annotations["gene_ids"].isin(genes_to_keep_ids)
- exploded_annotations = exploded_annotations[mask]
- annotation_df = annotation_df[
- annotation_df.index.isin(exploded_annotations.index)
- ]
+ if self.gene_specific_anno:
+ mask &= annotation_df["gene_id"].isin(genes_to_keep_ids)
+ else:
+ mask &= exploded_annotations["gene_id"].isin(genes_to_keep_ids)
- self.annotation_df = annotation_df[set(self.annotations + threshold_cols)]
- self.exploded_annotations = exploded_annotations[
- set([self.grouping_column] + self.annotations + threshold_cols)
- ].astype({self.grouping_column: np.int32})
+ if self.gene_specific_anno:
+ annotation_df = annotation_df[mask]
+ else:
+ exploded_annotations = exploded_annotations[mask]
+ annotation_df = annotation_df[
+ annotation_df.index.isin(exploded_annotations.index)
+ ]
+
+ if self.gene_specific_anno:
+ self.annotation_df = annotation_df[
+ list(set([self.grouping_column] + self.annotations + threshold_cols))
+ ].astype({self.grouping_column: np.int32})
+ else:
+ self.annotation_df = annotation_df[set(self.annotations + threshold_cols)]
+ self.exploded_annotations = exploded_annotations[
+ set([self.grouping_column] + self.annotations + threshold_cols)
+ ].astype({self.grouping_column: np.int32})
- if len(self.exploded_annotations) == 0:
+ if (
+ len(self.annotation_df)
+ if self.gene_specific_anno
+ else len(self.exploded_annotations)
+ ) == 0:
raise RuntimeError(f"No rare variants found in provided genes")
def apply_thresholds(self, thresholds: Optional[Dict[str, str]]):
if thresholds is not None:
self.annotation_df["mask"] = True
- self.exploded_annotations["mask"] = True
+ if not self.gene_specific_anno:
+ self.exploded_annotations["mask"] = True
for op in thresholds.values():
self.annotation_df["mask"] &= self.annotation_df.eval(op)
- self.exploded_annotations["mask"] &= self.exploded_annotations.eval(op)
+ if not self.gene_specific_anno:
+ self.exploded_annotations["mask"] &= self.exploded_annotations.eval(
+ op
+ )
self.annotation_df = self.annotation_df[self.annotation_df["mask"]]
- self.exploded_annotations = self.exploded_annotations[
- self.exploded_annotations["mask"]
+ if not self.gene_specific_anno:
+ self.exploded_annotations = self.exploded_annotations[
+ self.exploded_annotations["mask"]
+ ]
+
+ if self.gene_specific_anno:
+ self.annotation_df = self.annotation_df[
+ [self.grouping_column] + self.annotations
]
+ self.kept_variants = np.sort(self.annotation_df.index.unique().to_numpy())
+ self.variant_map[self.kept_variants] = np.arange(
+ self.kept_variants.shape[0]
+ )
- self.annotation_df = self.annotation_df[self.annotations]
- self.exploded_annotations = self.exploded_annotations[
- [self.grouping_column] + self.annotations
- ]
- self.kept_variants = np.sort(self.annotation_df.index.to_numpy())
- assert np.all(self.kept_variants == np.unique(self.kept_variants))
- self.variant_map[self.kept_variants] = np.arange(len(self.annotation_df))
+ if self.kept_variants.shape[0] == 0:
+ raise RuntimeError(f" No variants passed thresholding")
- if len(self.annotation_df) == 0:
- raise RuntimeError(f" No variants passed thresholding")
+ logger.info(f" {self.kept_variants.shape[0]} variants passed thresholding")
- logger.info(f" {len(self.annotation_df)} variants passed thresholding")
+ self.annotation_df_np = self.annotation_df[self.annotations].to_numpy()
+ self.genes_np = copy.deepcopy(
+ self.annotation_df[self.grouping_column].to_numpy()
+ )
+ else:
+ self.annotation_df = self.annotation_df[self.annotations]
+ self.exploded_annotations = self.exploded_annotations[
+ [self.grouping_column] + self.annotations
+ ]
+ self.kept_variants = np.sort(self.annotation_df.index.to_numpy())
+ assert np.all(self.kept_variants == np.unique(self.kept_variants))
+ self.variant_map[self.kept_variants] = np.arange(len(self.annotation_df))
- self.exploded_annotations_np = self.exploded_annotations[
- self.annotations
- ].to_numpy()
- self.genes_np = copy.deepcopy(
- self.exploded_annotations[self.grouping_column].to_numpy()
- )
+ if len(self.annotation_df) == 0:
+ raise RuntimeError(f" No variants passed thresholding")
+
+ logger.info(f" {len(self.annotation_df)} variants passed thresholding")
+
+ self.exploded_annotations_np = self.exploded_annotations[
+ self.annotations
+ ].to_numpy()
+ self.genes_np = copy.deepcopy(
+ self.exploded_annotations[self.grouping_column].to_numpy()
+ )
def remap_group_ids(self):
- self.gene_map = -(2**24) * np.ones(
- self.exploded_annotations[self.grouping_column].max() + 1, dtype=np.int32
+ max_gene_id = (
+ self.annotation_df[self.grouping_column].max()
+ if self.gene_specific_anno
+ else self.exploded_annotations[self.grouping_column].max()
+ )
+ self.gene_map = -(2**24) * np.ones(max_gene_id + 1, dtype=np.int32)
+ self.genes = (
+ np.sort(self.annotation_df[self.grouping_column].unique())
+ if self.gene_specific_anno
+ else np.sort(self.exploded_annotations[self.grouping_column].unique())
)
- self.genes = np.sort(self.exploded_annotations[self.grouping_column].unique())
self.n_genes = len(self.genes)
logger.info(
f"Found {self.n_genes} genes with rare variants " "that pass thresholds"
)
self.gene_map[self.genes] = np.arange(self.genes.shape[0])
- self.exploded_annotations[self.grouping_column] = self.gene_map[
- self.exploded_annotations[self.grouping_column].to_numpy()
- ]
+ if self.gene_specific_anno:
+ self.annotation_df[self.grouping_column] = self.gene_map[
+ self.annotation_df[self.grouping_column].to_numpy()
+ ]
+ else:
+ self.exploded_annotations[self.grouping_column] = self.gene_map[
+ self.exploded_annotations[self.grouping_column].to_numpy()
+ ]
def setup_metadata(self):
logger.debug(" Precomputing integer indices for exploded dataframe")
+ max_variant_id = (
+ np.max(self.kept_variants)
+ if self.gene_specific_anno
+ else self.annotation_df.index.max()
+ )
self.exp_anno_id_indices = [
- np.array([], dtype=np.int32)
- for _ in range(self.annotation_df.index.max() + 1)
+ np.array([], dtype=np.int32) for _ in range(max_variant_id + 1)
]
- for i in range(len(self.exploded_annotations)):
- j = self.exploded_annotations.index[i]
- self.exp_anno_id_indices[j] = np.append(self.exp_anno_id_indices[j], i)
+
+ if self.gene_specific_anno:
+ for i in range(len(self.annotation_df)):
+ j = self.annotation_df.index[i]
+ self.exp_anno_id_indices[j] = np.append(self.exp_anno_id_indices[j], i)
+ else:
+ for i in range(len(self.exploded_annotations)):
+ j = self.exploded_annotations.index[i]
+ self.exp_anno_id_indices[j] = np.append(self.exp_anno_id_indices[j], i)
def get_metadata(self) -> Dict[str, np.ndarray]:
return {
@@ -301,7 +387,7 @@ def get_metadata(self) -> Dict[str, np.ndarray]:
}
-# #this class is used for the seed gene discovery
+# this class is used for the seed gene discovery
class SparseGenotype:
def __init__(
self,
@@ -321,6 +407,7 @@ def __init__(
self.grouping_column = base_dataset.grouping_column
self.stand_params = None
self.low_memory = low_memory
+ self.gene_specific_anno = self.base_dataset.gene_specific_anno
self.max_variant_id = base_dataset.variants.index[
base_dataset.variants["rare_variant_mask"]
@@ -330,12 +417,21 @@ def __init__(
logger.debug("Setting up based on training dataset")
train_embedding = self.base_dataset.train_dataset.rare_embedding
self.annotation_df = train_embedding.annotation_df
- self.exploded_annotations = train_embedding.exploded_annotations
- self.exploded_annotations_np = self.exploded_annotations_np
+ if self.gene_specific_anno:
+                self.annotation_df_np = train_embedding.annotation_df_np
+            else:
+                self.exploded_annotations = train_embedding.exploded_annotations
+                self.exploded_annotations_np = train_embedding.exploded_annotations_np
self.gene_map = train_embedding.gene_map
self.genes = train_embedding.genes
else:
logger.debug("Setting up annotations")
+ if self.gene_specific_anno:
+ assert base_dataset.variants.index.name == "id"
+ if "id" in base_dataset.variants.columns:
+ assert (
+ base_dataset.variants.index == base_dataset.variants["id"]
+ ).all()
rare_variant_ids = base_dataset.variants.index[
base_dataset.variants["rare_variant_mask"]
]
@@ -355,7 +451,8 @@ def __init__(
if self.low_memory:
logger.info(f" Cleaning up to save memory")
self.annotation_df = None
- self.exploded_annotations = None
+ if not self.gene_specific_anno:
+ self.exploded_annotations = None
self.base_dataset.annotation_df = None
def embed(
@@ -392,16 +489,25 @@ def setup_annotations(
logger.debug(" Filtering by rare variant IDs and by gene")
annotation_df = self.base_dataset.annotation_df
+
threshold_cols = list(
set()
if thresholds is None
else set(thresholds.keys()) & set(annotation_df.columns)
)
mask = annotation_df.index.isin(rare_variant_ids)
- mask &= annotation_df[self.grouping_column].apply(lambda x: len(x) > 0)
- annotation_df = annotation_df.loc[
- mask, set(self.annotations + [self.grouping_column] + threshold_cols)
- ].copy()
+ if self.gene_specific_anno:
+ mask &= annotation_df[self.grouping_column].notna()
+ annotation_df = annotation_df.loc[
+ mask,
+ list(set(self.annotations + [self.grouping_column] + threshold_cols)),
+ ].copy()
+ else:
+ mask &= annotation_df[self.grouping_column].apply(lambda x: len(x) > 0)
+ annotation_df = annotation_df.loc[
+ mask, set(self.annotations + [self.grouping_column] + threshold_cols)
+ ].copy()
+
# standardize here
if (
self.base_dataset.standardize_rare_anno
@@ -437,83 +543,156 @@ def setup_annotations(
# return standardization params
- logger.debug(" Exploding annotations by groups")
- annotation_df[self.grouping_column] = annotation_df[self.grouping_column].apply(
- lambda x: list(set(list(x)))
- )
- exploded_annotations = annotation_df[
- set([self.grouping_column] + self.annotations + threshold_cols)
- ].explode(self.grouping_column)
- if gene_file is not None:
- logger.debug(" Filtering by provided genes")
- genes_df = pd.read_parquet(gene_file, columns=["id", "gene"])
- genes = set(genes_df["id"])
- mask = exploded_annotations["gene_ids"].isin(genes)
-
- if genes_to_keep is not None:
- genes_to_keep_ids = set(
- genes_df[genes_df["gene"].isin(genes_to_keep)]["id"]
- )
- mask &= exploded_annotations["gene_ids"].isin(genes_to_keep_ids)
-
- exploded_annotations = exploded_annotations[mask]
+ if self.gene_specific_anno:
annotation_df = annotation_df[
- annotation_df.index.isin(exploded_annotations.index)
+ list(set([self.grouping_column] + self.annotations + threshold_cols))
]
+ if gene_file is not None:
+ logger.debug(" Filtering by provided genes")
+ genes_df = pd.read_parquet(gene_file, columns=["id", "gene"])
+ genes = set(genes_df["id"])
+ mask = annotation_df["gene_id"].isin(genes)
+ if genes_to_keep is not None:
+ genes_to_keep_ids = set(
+ genes_df[genes_df["gene"].isin(genes_to_keep)]["id"]
+ )
+ mask &= annotation_df["gene_id"].isin(genes_to_keep_ids)
+ annotation_df = annotation_df[mask]
- self.annotation_df = annotation_df[set(self.annotations + threshold_cols)]
- self.exploded_annotations = exploded_annotations[
- set([self.grouping_column] + self.annotations + threshold_cols)
- ].astype({self.grouping_column: np.int32})
+ self.annotation_df = annotation_df[
+ list(set([self.grouping_column] + self.annotations + threshold_cols))
+ ].astype({self.grouping_column: np.int32})
+ else:
+ logger.debug(" Exploding annotations by groups")
+ annotation_df[self.grouping_column] = annotation_df[
+ self.grouping_column
+ ].apply(lambda x: list(set(list(x))))
+ exploded_annotations = annotation_df[
+ set([self.grouping_column] + self.annotations + threshold_cols)
+ ].explode(self.grouping_column)
+
+ if gene_file is not None:
+ logger.debug(" Filtering by provided genes")
+ genes_df = pd.read_parquet(gene_file, columns=["id", "gene"])
+ genes = set(genes_df["id"])
+ mask = exploded_annotations["gene_id"].isin(genes)
+
+ if genes_to_keep is not None:
+ genes_to_keep_ids = set(
+ genes_df[genes_df["gene"].isin(genes_to_keep)]["id"]
+ )
+ mask &= exploded_annotations["gene_id"].isin(genes_to_keep_ids)
+
+ exploded_annotations = exploded_annotations[mask]
+ annotation_df = annotation_df[
+ annotation_df.index.isin(exploded_annotations.index)
+ ]
+
+ self.annotation_df = annotation_df[set(self.annotations + threshold_cols)]
+ self.exploded_annotations = exploded_annotations[
+ set([self.grouping_column] + self.annotations + threshold_cols)
+ ].astype({self.grouping_column: np.int32})
+
+ if len(self.annotation_df) == 0:
+ raise RuntimeError(f"No rare variants found in provided genes")
def apply_thresholds(self, thresholds: Optional[Dict[str, str]]):
- if thresholds is not None:
- self.annotation_df["mask"] = True
- self.exploded_annotations["mask"] = True
- for op in thresholds.values():
- self.annotation_df["mask"] &= self.annotation_df.eval(op)
- self.exploded_annotations["mask"] &= self.exploded_annotations.eval(op)
- self.annotation_df = self.annotation_df[self.annotation_df["mask"]]
- self.exploded_annotations = self.exploded_annotations[
- self.exploded_annotations["mask"]
+ if self.gene_specific_anno:
+ if thresholds is not None:
+ self.annotation_df["mask"] = True
+ for op in thresholds.values():
+ self.annotation_df["mask"] &= self.annotation_df.eval(op)
+
+ self.annotation_df = self.annotation_df[self.annotation_df["mask"]]
+
+ self.annotation_df = self.annotation_df[
+ [self.grouping_column] + self.annotations
]
+ self.kept_variants = np.sort(self.annotation_df.index.unique().to_numpy())
+ self.variant_map[self.kept_variants] = np.arange(
+ self.kept_variants.shape[0]
+ )
- self.annotation_df = self.annotation_df[self.annotations]
- self.exploded_annotations = self.exploded_annotations[
- [self.grouping_column] + self.annotations
- ]
- self.exploded_annotations_np = self.exploded_annotations[
- self.annotations
- ].to_numpy()
+ if self.kept_variants.shape[0] == 0:
+ raise RuntimeError(f" No variants passed thresholding")
- self.kept_variants = np.sort(self.annotation_df.index.to_numpy())
- assert np.all(self.kept_variants == np.unique(self.kept_variants))
- self.variant_map[self.kept_variants] = np.arange(len(self.annotation_df))
+ logger.info(f" {self.kept_variants.shape[0]} variants passed thresholding")
+
+ self.annotation_df_np = self.annotation_df[self.annotations].to_numpy()
+ else:
+ if thresholds is not None:
+ self.annotation_df["mask"] = True
+ self.exploded_annotations["mask"] = True
+ for op in thresholds.values():
+ self.annotation_df["mask"] &= self.annotation_df.eval(op)
+ self.exploded_annotations["mask"] &= self.exploded_annotations.eval(
+ op
+ )
+
+ self.annotation_df = self.annotation_df[self.annotation_df["mask"]]
+ self.exploded_annotations = self.exploded_annotations[
+ self.exploded_annotations["mask"]
+ ]
+
+ self.annotation_df = self.annotation_df[self.annotations]
+ self.exploded_annotations = self.exploded_annotations[
+ [self.grouping_column] + self.annotations
+ ]
+ self.exploded_annotations_np = self.exploded_annotations[
+ self.annotations
+ ].to_numpy()
+
+ self.kept_variants = np.sort(self.annotation_df.index.to_numpy())
+ assert np.all(self.kept_variants == np.unique(self.kept_variants))
+ self.variant_map[self.kept_variants] = np.arange(len(self.annotation_df))
def remap_group_ids(self):
- self.gene_map = -(2**24) * np.ones(
- self.exploded_annotations[self.grouping_column].max() + 1, dtype=np.int32
+ max_gene_id = (
+ self.annotation_df[self.grouping_column].max()
+ if self.gene_specific_anno
+ else self.exploded_annotations[self.grouping_column].max()
+ )
+ self.gene_map = -(2**24) * np.ones(max_gene_id + 1, dtype=np.int32)
+ genes_dup = (
+ self.annotation_df[self.grouping_column]
+ if self.gene_specific_anno
+ else self.exploded_annotations[self.grouping_column]
)
- self.genes = np.sort(self.exploded_annotations[self.grouping_column].unique())
+ self.genes = np.sort(genes_dup.unique())
self.n_genes = len(self.genes)
logger.info(
f"Found {self.n_genes} genes with rare variants " "that pass thresholds"
)
self.gene_map[self.genes] = np.arange(self.genes.shape[0])
- self.exploded_annotations[self.grouping_column] = self.gene_map[
- self.exploded_annotations[self.grouping_column].to_numpy()
- ]
+ if self.gene_specific_anno:
+ self.annotation_df[self.grouping_column] = self.gene_map[
+ self.annotation_df[self.grouping_column].to_numpy()
+ ]
+ else:
+ self.exploded_annotations[self.grouping_column] = self.gene_map[
+ self.exploded_annotations[self.grouping_column].to_numpy()
+ ]
def setup_metadata(self):
logger.debug(" Precomputing integer indices for exploded dataframe")
+ max_variant_id = (
+ np.max(self.kept_variants)
+ if self.gene_specific_anno
+ else self.annotation_df.index.max()
+ )
self.exp_anno_id_indices = [
- np.array([], dtype=np.int32)
- for _ in range(self.annotation_df.index.max() + 1)
+ np.array([], dtype=np.int32) for _ in range(max_variant_id + 1)
]
- for i in range(len(self.exploded_annotations)):
- j = self.exploded_annotations.index[i]
- self.exp_anno_id_indices[j] = np.append(self.exp_anno_id_indices[j], i)
+
+ if self.gene_specific_anno:
+ for i in range(len(self.annotation_df)):
+ j = self.annotation_df.index[i]
+ self.exp_anno_id_indices[j] = np.append(self.exp_anno_id_indices[j], i)
+ else:
+ for i in range(len(self.exploded_annotations)):
+ j = self.exploded_annotations.index[i]
+ self.exp_anno_id_indices[j] = np.append(self.exp_anno_id_indices[j], i)
def get_metadata(self) -> Dict[str, np.ndarray]:
return {
diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index f24d3fa3..5af2e770 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -13,15 +13,17 @@
import dask.dataframe as dd
import numpy as np
import pandas as pd
+import pyranges as pr
import torch
import torch.nn as nn
import statsmodels.api as sm
import yaml
+from bgen import BgenWriter
from numcodecs import Blosc
from seak import scoretest
from statsmodels.tools.tools import add_constant
from torch.utils.data import DataLoader, Dataset, Subset
-from tqdm import tqdm
+from tqdm import tqdm, trange
import zarr
import re
@@ -30,7 +32,7 @@
logging.basicConfig(
format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
- level="INFO",
+ level=logging.INFO,
stream=sys.stdout,
)
logger = logging.getLogger(__name__)
@@ -44,6 +46,8 @@
"Consequence_splice_donor_variant",
]
+AGG_FCT = {"mean": np.mean, "max": np.max}
+
def get_burden(
batch: Dict,
@@ -300,7 +304,7 @@ def compute_burdens_(
Path(cache_dir) / "burdens.zarr",
mode="a",
shape=(n_total_samples,) + this_burdens.shape[1:],
- chunks=(1000, 1000),
+ chunks=(1000, 1000, 1),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
@@ -332,7 +336,6 @@ def compute_burdens_(
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
-
start_idx = i * batch_size
end_idx = min(start_idx + batch_size, chunk_end) # read from chunk shape
@@ -366,6 +369,285 @@ def compute_burdens_(
return ds_full.rare_embedding.genes, burdens, y, x, sample_ids
+def make_regenie_input_(
+ debug: bool,
+ skip_samples: bool,
+ skip_covariates: bool,
+ skip_phenotypes: bool,
+ skip_burdens: bool,
+ repeat: int,
+ average_repeats: bool,
+ phenotype: Tuple[Tuple[str, Path, Path]],
+ sample_file: Optional[Path],
+ covariate_file: Optional[Path],
+ phenotype_file: Optional[Path],
+ bgen: Optional[Path],
+ gene_file: Path,
+ gtf: Path,
+):
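+    """
+    Prepare input files for REGENIE from precomputed DeepRVAT results: a sample file,
+    a covariate file, a phenotype file, and a BGEN file with one pseudovariant per gene
+    whose genotype probabilities encode the gene impairment scores (from a single repeat
+    or averaged across repeats). Individual outputs can be disabled with the
+    corresponding --skip-* flags.
+    """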
+ ## Check options
+ if (repeat >= 0) + average_repeats + skip_burdens != 1:
+ raise ValueError(
+ "Exactly one of --repeat or --average-repeats or --skip-burdens must be specified"
+ )
+ if not skip_samples and sample_file is None:
+ raise ValueError("Either sample_file or skip_samples must be specified")
+ if not skip_covariates and covariate_file is None:
+ raise ValueError("Either covariate_file or skip_covariates must be specified")
+ if not skip_phenotypes and phenotype_file is None:
+ raise ValueError("Either phenotype_file or skip_phenotypes must be specified")
+ if not skip_burdens and bgen is None:
+ raise ValueError("Either bgen or skip_burdens must be specified")
+
+ ## Make BGEN
+
+ # Load data
+ logger.info("Loading computed burdens, covariates, phenotypes and metadata")
+
+ phenotype_names = [p[0] for p in phenotype]
+ dataset_files = [p[1] for p in phenotype]
+ burden_dirs = [p[2] for p in phenotype]
+
+ sample_ids = zarr.load(burden_dirs[0] / "sample_ids.zarr")
+ covariates = zarr.load(burden_dirs[0] / "x.zarr")
+ ys = [zarr.load(b / "y.zarr") for b in burden_dirs]
+ genes = np.load(burden_dirs[0] / "genes.npy")
+
+ if debug:
+ sample_ids = sample_ids[:1000]
+ covariates = covariates[:1000]
+ ys = [y[:1000] for y in ys]
+
+ n_samples = sample_ids.shape[0]
+ n_genes = genes.shape[0]
+ assert covariates.shape[0] == n_samples
+ assert all([y.shape[0] == n_samples for y in ys])
+
+ # Sanity check: sample_ids, covariates, and genes should be consistent for all phenotypes
+ # TODO: Check burdens as well (though this will be slow)
+ if not debug:
+ for i in range(1, len(phenotype)):
+ assert np.array_equal(
+ sample_ids, zarr.load(burden_dirs[i] / "sample_ids.zarr")
+ )
+ assert np.array_equal(
+ covariates, zarr.load(burden_dirs[i] / "x.zarr")
+ ) # TODO: Phenotype-specific covariates
+ assert np.array_equal(genes, np.load(burden_dirs[i] / "genes.npy"))
+
+ sample_df = pd.DataFrame({"FID": sample_ids, "IID": sample_ids})
+
+ if not skip_samples:
+ ## Make sample file
+ logger.info(f"Creating sample file {sample_file}")
+ samples_out = pd.concat(
+ [
+ pd.DataFrame({"ID_1": 0, "ID_2": 0}, index=[0]),
+ sample_df.rename(
+ columns={
+ "FID": "ID_1",
+ "IID": "ID_2",
+ }
+ ),
+ ]
+ )
+ samples_out.to_csv(sample_file, sep=" ", index=False)
+
+ if not skip_covariates:
+ ## Make covariate file
+ logger.info(f"Creating covariate file {covariate_file}")
+ with open(dataset_files[0], "rb") as f:
+ dataset = pickle.load(f)
+
+ covariate_names = dataset.x_phenotypes
+ cov_df = pd.DataFrame(covariates, columns=covariate_names)
+ cov_df = pd.concat([sample_df, cov_df], axis=1)
+ cov_df.to_csv(covariate_file, sep=" ", index=False, na_rep="NA")
+
+ if not skip_phenotypes:
+ ## Make phenotype file
+ logger.info(f"Creating phenotype file {phenotype_file}")
+ pheno_df_list = []
+ for p, y in zip(phenotype_names, ys):
+ pheno_df_list.append(pd.DataFrame({p: y.squeeze()}))
+
+ pheno_df = pd.concat([sample_df] + pheno_df_list, axis=1)
+ pheno_df.to_csv(phenotype_file, sep=" ", index=False, na_rep="NA")
+
+ if not skip_burdens:
+ logger.warning(
+ "Using burdens from first phenotype passed. "
+ "Burdens from other phenotypes will be ignored."
+ )
+ burdens_zarr = zarr.open(burden_dirs[0] / "burdens.zarr")
+ if not debug:
+ assert burdens_zarr.shape[0] == n_samples
+ assert burdens_zarr.shape[1] == n_genes
+
+ if average_repeats:
+ logger.info("Averaging burdens across all repeats")
+ burdens = np.zeros((n_samples, n_genes))
+ for repeat in trange(burdens_zarr.shape[2]):
+ burdens += burdens_zarr[:n_samples, :, repeat]
+ burdens = burdens / burdens_zarr.shape[2]
+ else:
+ logger.info(f"Using burdens from repeat {repeat}")
+ assert repeat < burdens_zarr.shape[2]
+ burdens = burdens_zarr[:n_samples, :, repeat]
+
+        # Read GTF file and get positions for pseudovariants (center of interval [Start, End])
+        logger.info(
+            f"Assigning positions to pseudovariants based on provided GTF file {gtf}"
+        )
+        gene_pos = pr.read_gtf(gtf)
+        gene_pos = gene_pos[
+            (gene_pos.Feature == "gene") & (gene_pos.gene_type == "protein_coding")
+        ][["Chromosome", "Start", "End", "gene_id"]].as_df()
+        gene_pos = gene_pos.set_index("gene_id")
+        gene_metadata = pd.read_parquet(gene_file).set_index("id")
+        this_gene_pos = gene_pos.loc[gene_metadata.loc[genes, "gene"]]
+        pseudovar_pos = (
+            ((this_gene_pos.Start + this_gene_pos.End) // 2).to_numpy().astype(int)
+        )
+        ensgids = this_gene_pos.index.to_numpy()
+
+        logger.info(f"Writing pseudovariants to {bgen}")
+        with BgenWriter(
+            bgen,
+            n_samples,
+            samples=list(sample_ids),
+            metadata="Pseudovariants containing DeepRVAT gene impairment scores. One pseudovariant per gene.",
+        ) as f:
+            for i in trange(n_genes):
+                varid = f"pseudovariant_gene_{ensgids[i]}"
+                this_burdens = burdens[:, i]
+                # Encode burden b as genotype probabilities (b, 0, 1 - b); the expected
+                # allele dosage is then linear in b and lies in the range (0, 2)
+                genotypes = np.stack(
+                    (this_burdens, np.zeros(this_burdens.shape), 1 - this_burdens),
+                    axis=1,
+                )
+
+                f.add_variant(
+                    varid=varid,
+                    rsid=varid,
+                    chrom=this_gene_pos.iloc[i].Chromosome,
+                    pos=pseudovar_pos[i],
+                    alleles=[
+                        "A",
+                        "C",
+                    ],  # TODO: This is completely arbitrary, however, we might want to match it to a reference FASTA at some point
+                    genotypes=genotypes,
+                    ploidy=2,
+                    bit_depth=16,
+                )
+
+
+@cli.command()
+@click.option("--debug", is_flag=True)
+@click.option("--skip-samples", is_flag=True)
+@click.option("--skip-covariates", is_flag=True)
+@click.option("--skip-phenotypes", is_flag=True)
+@click.option("--skip-burdens", is_flag=True)
+@click.option("--repeat", type=int, default=-1)
+@click.option("--average-repeats", is_flag=True)
+@click.option(
+ "--phenotype",
+ type=(
+ str,
+ click.Path(exists=True, path_type=Path),
+ click.Path(exists=True, path_type=Path),
+ ),
+ multiple=True,
+) # phenotype_name, dataset_file, burden_dir
+@click.option("--sample-file", type=click.Path(path_type=Path))
+@click.option("--bgen", type=click.Path(path_type=Path))
+@click.option("--covariate-file", type=click.Path(path_type=Path))
+@click.option("--phenotype-file", type=click.Path(path_type=Path))
+# @click.argument("dataset-file", type=click.Path(exists=True, path_type=Path))
+# @click.argument("burden-dir", type=click.Path(exists=True, path_type=Path))
+@click.argument("gene-file", type=click.Path(exists=True, path_type=Path))
+@click.argument("gtf", type=click.Path(exists=True, path_type=Path))
+def make_regenie_input(
+ debug: bool,
+ skip_samples: bool,
+ skip_covariates: bool,
+ skip_phenotypes: bool,
+ skip_burdens: bool,
+ repeat: int,
+ average_repeats: bool,
+ phenotype: Tuple[Tuple[str, Path, Path]],
+ sample_file: Optional[Path],
+ covariate_file: Optional[Path],
+ phenotype_file: Optional[Path],
+ bgen: Optional[Path],
+ gene_file: Path,
+ gtf: Path,
+):
+ make_regenie_input_(
+ debug=debug,
+ skip_samples=skip_samples,
+ skip_covariates=skip_covariates,
+ skip_phenotypes=skip_phenotypes,
+ skip_burdens=skip_burdens,
+ repeat=repeat,
+ average_repeats=average_repeats,
+ phenotype=phenotype,
+ sample_file=sample_file,
+ covariate_file=covariate_file,
+ phenotype_file=phenotype_file,
+ bgen=bgen,
+ gene_file=gene_file,
+ gtf=gtf,
+ )
+
+
+def convert_regenie_output_(
+ repeat: int, phenotype: Tuple[str, Tuple[Path, Path]], gene_file: Path
+):
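+    """
+    Convert REGENIE association results into the DeepRVAT results format: pseudovariant
+    IDs are mapped back to gene IDs via gene_file, LOG10P is converted to a p-value, and
+    one parquet file per phenotype is written to the given output path.
+    """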
+ genes = pd.read_parquet(gene_file)[["id", "gene"]]
+ for pheno_name, regenie_results, out_file in phenotype:
+ regenie_cols = ["TEST", "SE", "CHISQ"]
+ regenie_col_newnames = [f"regenie_{c}" for c in regenie_cols]
+ result_df = pd.read_csv(regenie_results, sep=" ")[
+ ["ID", "BETA", "LOG10P"] + regenie_cols
+ ]
+
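+        # variant IDs have the form "pseudovariant_gene_<ENSGID>" (see make_regenie_input_); recover the gene identifier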
+ result_df["gene"] = result_df["ID"].str.split("_", expand=True)[2]
+ old_len = len(result_df)
+ result_df = pd.merge(result_df, genes, validate="1:1")
+ assert len(result_df) == old_len
+ result_df = result_df.drop(columns="ID")
+ result_df = result_df.drop(columns="gene").rename(columns={"id": "gene"})
+
+ result_df["phenotype"] = pheno_name
+ result_df = result_df.rename(columns={"BETA": "beta"})
+ result_df["pval"] = np.power(10, -result_df["LOG10P"])
+ result_df = result_df.drop(columns="LOG10P")
+ result_df["model"] = f"repeat_{repeat}"
+ result_df = result_df.rename(
+ columns=dict(zip(regenie_cols, regenie_col_newnames))
+ )
+ result_df = result_df[
+ ["phenotype", "gene", "beta", "pval", "model"] + regenie_col_newnames
+ ]
+ result_df.to_parquet(out_file)
+
+
+@cli.command()
+@click.option("--repeat", type=int, default=0)
+@click.option(
+ "--phenotype",
+ type=(
+ str,
+ click.Path(exists=True, path_type=Path), # REGENIE output file
+ click.Path(path_type=Path), # Converted results
+ ),
+ multiple=True,
+)
+@click.argument("gene-file", type=click.Path(exists=True, path_type=Path))
+def convert_regenie_output(
+ repeat: int, phenotype: Tuple[str, Tuple[Path, Path]], gene_file: Path
+):
+ convert_regenie_output_(repeat, phenotype, gene_file)
+
+
def load_one_model(
config: Dict,
checkpoint: str,
@@ -427,16 +709,24 @@ def reverse_models(
logger.info("Using CPU")
device = torch.device("cpu")
- plof_df = (
- dd.read_parquet(
- annotation_file,
- columns=data_config["data"]["dataset_config"]["rare_embedding"]["config"][
- "annotations"
- ],
- )
- .query(" or ".join([f"{c} == 1" for c in PLOF_COLS]))
- .compute()
+ # plof_df = (
+ # dd.read_parquet(
+ # annotation_file,
+ # columns=data_config["data"]["dataset_config"]["rare_embedding"]["config"][
+ # "annotations"
+ # ],
+ # )
+ # .query(" or ".join([f"{c} == 1" for c in PLOF_COLS]))
+ # .compute()
+ # )
+
+ plof_df = pd.read_parquet(
+ annotation_file,
+ columns=data_config["data"]["dataset_config"]["rare_embedding"]["config"][
+ "annotations"
+ ],
)
+ plof_df = plof_df[plof_df[PLOF_COLS].eq(1).any(axis=1)]
plof_zero_df = plof_df.copy()
plof_zero_df.loc[:, PLOF_COLS] = 0.0
@@ -449,7 +739,6 @@ def reverse_models(
if Path(checkpoint + ".dropped").is_file():
# Ignore checkpoints that were chosen to be dropped
continue
-
agg_model = load_one_model(data_config, checkpoint, device=device)
score = agg_model(
torch.tensor(plof, dtype=torch.float, device=device).reshape(
@@ -667,6 +956,7 @@ def regress_on_gene_scoretest(
:rtype: Tuple[List[str], List[float], List[float]]
"""
burdens = burdens.reshape(burdens.shape[0], -1)
+ assert np.all(burdens != 0) # TODO check this!
logger.info(f"Burdens shape: {burdens.shape}")
if np.all(np.abs(burdens) < 1e-6):
@@ -852,6 +1142,7 @@ def regress_(
@click.option("--repeat", type=int, default=0)
@click.option("--do-scoretest", is_flag=True)
@click.option("--sample-file", type=click.Path(exists=True))
+@click.option("--burden-file", type=click.Path(exists=True))
@click.argument("config-file", type=click.Path(exists=True))
@click.argument("burden-dir", type=click.Path(exists=True))
@click.argument("out-dir", type=click.Path())
@@ -867,6 +1158,7 @@ def regress(
out_dir: str,
do_scoretest: bool,
sample_file: Optional[str],
+ burden_file: Optional[str],
):
"""
Perform regression analysis.
@@ -896,8 +1188,13 @@ def regress(
:return: Regression results saved to out_dir as "burden_associations_{chunk}.parquet"
"""
logger.info("Loading saved burdens")
+ # if burden_file is not None:
+ # logger.info(f'Loading burdens from {burden_file}')
+ # burdens = zarr.open(burden_file)[:, :, repeat]
+ # else:
+ # burdens = zarr.open(Path(burden_dir) / "burdens.zarr")[:, :, repeat]
+ logger.info(f"Loading x, y, genes from {burden_dir}")
y = zarr.open(Path(burden_dir) / "y.zarr")[:]
- burdens = zarr.open(Path(burden_dir) / "burdens.zarr")[:, :, repeat]
x_pheno = zarr.open(Path(burden_dir) / "x.zarr")[:]
genes = pd.Series(np.load(Path(burden_dir) / "genes.npy"))
@@ -906,18 +1203,18 @@ def regress(
samples = pickle.load(f)["association_samples"]
if debug:
samples = [s for s in samples if s < 1000]
- burdens = burdens[samples]
+ # burdens = burdens[samples]
y = y[samples]
x_pheno = x_pheno[samples]
- n_samples = burdens.shape[0]
+ n_samples = y.shape[0]
assert y.shape[0] == n_samples
assert x_pheno.shape[0] == n_samples
- assert len(genes) == burdens.shape[1]
+ # assert len(genes) == burdens.shape[1]
nan_mask = ~np.isnan(y).squeeze()
y = y[nan_mask]
- burdens = burdens[nan_mask]
+ # burdens = burdens[nan_mask]
x_pheno = x_pheno[nan_mask]
with open(config_file) as f:
@@ -934,8 +1231,23 @@ def regress(
chunk_end = min(len(genes), chunk_start + chunk_size)
if chunk == n_chunks - 1:
assert chunk_end == len(genes)
- gene_indices = np.arange(chunk_start, chunk_end)
+ # gene_indices = np.arange(chunk_start, chunk_end)
+
genes = genes.iloc[chunk_start:chunk_end]
+ gene_indices = np.arange(len(genes))
+ logger.info(f"Only extracting genes in range {chunk_start, chunk_end}")
+ if burden_file is not None:
+ logger.info(f"Loading burdens from {burden_file}")
+ burdens = zarr.open(burden_file)[:, chunk_start:chunk_end, repeat]
+ else:
+ burdens = zarr.open(Path(burden_dir) / "burdens.zarr")[
+ :, chunk_start:chunk_end, repeat
+ ]
+
+ if sample_file is not None:
+ burdens = burdens[samples]
+ burdens = burdens[nan_mask]
+ assert len(genes) == burdens.shape[1]
associations = regress_(
config,
@@ -985,5 +1297,300 @@ def combine_regression_results(
results.to_parquet(out_file, engine="pyarrow")
+@cli.command()
+@click.option("--n-chunks", type=int)
+@click.option("--chunk", type=int)
+@click.option("-r", "--repeats", multiple=True, type=int)
+@click.option("--agg-fct", type=str, default="mean")
+@click.argument("burden-file", type=click.Path(exists=True))
+@click.argument("burden-out-file", type=click.Path())
+def average_burdens(
+ repeats: Tuple,
+ burden_file: str,
+ burden_out_file: str,
+ agg_fct: Optional[str] = "mean",
+ n_chunks: Optional[int] = None,
+ chunk: Optional[int] = None,
+):
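+    """
+    Aggregate burden scores across the selected repeats (by default the mean; see
+    the AGG_FCT mapping) and write them to a new zarr file with a single repeat
+    dimension. Processing can be restricted to a chunk of samples via
+    --chunk / --n-chunks.
+    """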
+ compression_level = 1
+ logger.info(f"Analyzing repeats {repeats}")
+ logger.info(f"Reading burdens to aggregate from {burden_file}")
+ burdens = zarr.open(burden_file)
+ n_total_samples = burdens.shape[0]
+ if chunk is not None:
+ if n_chunks is None:
+ raise ValueError("n_chunks must be specified if chunk is not None")
+ chunk_length = math.ceil(n_total_samples / n_chunks)
+ chunk_start = chunk * chunk_length
+ chunk_end = min(n_total_samples, chunk_start + chunk_length)
+ samples = range(chunk_start, chunk_end)
+ n_samples = len(samples)
+ print(chunk_start, chunk_end)
+ else:
+ n_samples = n_total_samples
+ chunk_start = 0
+ chunk_end = n_samples
+
+ logger.info(
+ f"Computing result for chunk {chunk} out of {n_chunks} in range {chunk_start}, {chunk_end}"
+ )
+
+ batch_size = 100
+ logger.info(f"Batch size: {batch_size}")
+ n_batches = n_samples // batch_size + (n_samples % batch_size != 0)
+
+ logger.info(f"Using aggregation function {agg_fct}")
+    for i in tqdm(
+        range(n_batches),
+        file=sys.stdout,
+        total=n_batches,
+    ):
+ if i == 0:
+ # if not os.path.exists(burden_out_file):
+ # logger.info('Generting new zarr file')
+ burdens_new = zarr.open(
+ burden_out_file,
+ mode="a",
+ shape=(burdens.shape[0], burdens.shape[1], 1),
+ chunks=(1000, 1000),
+ dtype=np.float32,
+ compressor=Blosc(clevel=compression_level),
+ )
+ # else:
+ # logger.info('Only opening zarr file')
+ # burdens_new = zarr.open(burden_out_file)
+
+ start_idx = chunk_start + i * batch_size
+ end_idx = min(start_idx + batch_size, chunk_end)
+ print(start_idx, end_idx)
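+        # Select the requested repeats along the last axis and collapse them with the chosen aggregation function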
+ this_burdens = np.take(burdens[start_idx:end_idx, :, :], repeats, axis=2)
+ this_burdens = AGG_FCT[agg_fct](this_burdens, axis=2)
+
+ burdens_new[start_idx:end_idx, :, 0] = this_burdens
+
+ logger.info(
+ f"Writing aggregted burdens in range {chunk_start}, {chunk_end} to {burden_out_file}"
+ )
+
+
+# TODO merge these functions into regress(), regress_
+@cli.command()
+@click.option("--debug", is_flag=True)
+@click.option("--chunk", type=int, default=0)
+@click.option("--n-chunks", type=int, default=1)
+@click.option("--use-bias", is_flag=True)
+@click.option("--gene-file", type=click.Path(exists=True))
+@click.option("--repeat", type=int, default=0)
+@click.option("--do-scoretest", is_flag=True)
+@click.option("--sample-file", type=click.Path(exists=True))
+@click.option("--burden-file", type=click.Path(exists=True))
+@click.option("--genes-to-keep", type=click.Path(exists=True))
+@click.option("--common-genotype-prefix", type=str)
+@click.argument("config-file", type=click.Path(exists=True))
+@click.argument("burden-dir", type=click.Path(exists=True))
+@click.argument("out-file", type=click.Path())
+def regress_common(
+ debug: bool,
+ chunk: int,
+ n_chunks: int,
+ use_bias: bool,
+ gene_file: str,
+ repeat: int,
+ config_file: str,
+ burden_dir: str,
+ out_file: str,
+ do_scoretest: bool,
+ sample_file: Optional[str],
+ burden_file: Optional[str],
+ genes_to_keep: Optional[str],
+ common_genotype_prefix: str,
+):
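+    """
+    Run gene-based association tests on DeepRVAT burdens while conditioning on
+    common-variant genotypes, which are read per gene from zarr files starting
+    with common_genotype_prefix. Results for the requested chunk of genes are
+    written to out_file as a parquet file.
+    """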
+ logger.info("Loading saved burdens")
+ # if burden_file is not None:
+ # logger.info(f'Loading burdens from {burden_file}')
+ # burdens = zarr.open(burden_file)[:, :, repeat]
+ # else:
+ # burdens = zarr.open(Path(burden_dir) / "burdens.zarr")[:, :, repeat]
+ logger.info(f"Loading x, y, genes from {burden_dir}")
+ y = zarr.open(Path(burden_dir) / "y.zarr")[:]
+ x_pheno = zarr.open(Path(burden_dir) / "x.zarr")[:]
+ genes = pd.Series(np.load(Path(burden_dir) / "genes.npy"))
+
+ if genes_to_keep is not None:
+ logger.info(f"Reading genes_to_keep file from {genes_to_keep}")
+ genes_to_keep = np.load(genes_to_keep)
+
+ if sample_file is not None:
+ with open(sample_file, "rb") as f:
+ samples = pickle.load(f)["association_samples"]
+ if debug:
+ samples = [s for s in samples if s < 1000]
+ # burdens = burdens[samples]
+ y = y[samples]
+ x_pheno = x_pheno[samples]
+
+ n_samples = y.shape[0]
+ assert y.shape[0] == n_samples
+ assert x_pheno.shape[0] == n_samples
+ # assert len(genes) == burdens.shape[1]
+
+ # TODO commented this out. is this a problem?
+ # nan_mask = ~np.isnan(y).squeeze()
+ # y = y[nan_mask]
+ # # burdens = burdens[nan_mask]
+ # x_pheno = x_pheno[nan_mask]
+
+ with open(config_file) as f:
+ config = yaml.safe_load(f)
+
+ if gene_file is not None:
+ logger.info("Loading gene names")
+ gene_df = pd.read_parquet(gene_file, engine="pyarrow")
+ gene_df.set_index("id")
+ genes = gene_df.loc[genes, "gene"].str.split(".").apply(lambda x: x[0])
+
+ chunk_size = math.ceil(len(genes) / n_chunks)
+ chunk_start = chunk * chunk_size
+ chunk_end = min(len(genes), chunk_start + chunk_size)
+ if chunk == n_chunks - 1:
+ assert chunk_end == len(genes)
+ # gene_indices = np.arange(chunk_start, chunk_end)
+
+ # gene_indices = np.arange(chunk_start, chunk_end)
+ logger.info(f"processing genes in range {chunk_start}, {chunk_end}")
+ all_genes = copy.deepcopy(genes)
+ genes = genes.iloc[chunk_start:chunk_end]
+ if genes_to_keep is not None:
+ # genes_this_chunk = set(genes).intersection(set(genes_to_keep))
+        genes_this_chunk = [
+            i for i in genes_to_keep if i in list(genes)
+        ]  # converting to list is important; otherwise 'in' checks the pd.Series index, not its values
+ gene_indices = np.array(
+ [np.where(all_genes == this_gene)[0][0] for this_gene in genes_this_chunk]
+ )
+ genes = pd.Series(list(genes_this_chunk))
+
+ logger.info(f"Only extracting genes in range {chunk_start, chunk_end}")
+
+ if burden_file is not None:
+ logger.info(f"Loading burdens from {burden_file}")
+ else:
+ burden_file = Path(burden_dir) / "burdens.zarr"
+
+ if genes_to_keep is not None:
+ logger.info(f"Loading burdens at position {gene_indices}")
+ burdens = zarr.open(burden_file)
+ burdens = burdens.oindex[:, gene_indices, repeat]
+ gene_indices = np.arange(len(genes))
+ else:
+ burdens = zarr.open(burden_file)[:, chunk_start:chunk_end, repeat]
+
+ gene_indices = np.arange(len(genes))
+
+ if sample_file is not None:
+ burdens = burdens[samples]
+ # burdens = burdens[nan_mask]
+ assert len(genes) == burdens.shape[1]
+ logger.info(common_genotype_prefix)
+ associations = regress_common_(
+ config,
+ use_bias,
+ burdens,
+ y,
+ gene_indices,
+ genes,
+ x_pheno,
+ common_genotype_prefix,
+ do_scoretest=do_scoretest,
+ )
+
+ logger.info("Saving results")
+ # Path(out_dir).mkdir(parents=True, exist_ok=True)
+ # associations.to_parquet(
+ # Path(out_dir) / f"burden_associations_{chunk}.parquet",
+ # engine="pyarrow",
+ # )
+ associations.to_parquet(Path(out_file), engine="pyarrow")
+
+
+def regress_common_(
+ config: Dict,
+ use_bias: bool,
+ burdens: np.ndarray,
+ y: np.ndarray,
+ gene_indices: np.ndarray,
+ genes: pd.Series,
+ x_pheno: np.ndarray,
+ common_genotype_prefix: str,
+ use_x_pheno: bool = True,
+ do_scoretest: bool = True,
+) -> pd.DataFrame:
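+    """
+    Regress the phenotype on each gene's burden score, using the covariates in
+    x_pheno together with the per-gene common-variant genotypes read from
+    "{common_genotype_prefix}_{gene}.zarr". Returns a data frame with the columns
+    phenotype, gene, beta and pval.
+    """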
+ assert len(gene_indices) == len(genes)
+ logger.info(common_genotype_prefix)
+
+ logger.info(f"Computing associations")
+ logger.info(f"Covariates shape: {x_pheno.shape}, y shape: {y.shape}")
+
+ regressed_genes = []
+ betas = []
+ pvals = []
+ logger.info("Running regression on each gene")
+ genes_betas_pvals = []
+ # for i, gene in tqdm(
+ # zip(gene_indices, genes), total=genes.shape[0], file=sys.stdout):
+ mask = ~np.isnan(y).reshape(-1)
+ y = y[mask]
+ for i, gene in zip(gene_indices, genes):
+ logger.info(f"rergressing on gene {gene}")
+ if common_genotype_prefix is not None:
+ logger.info(
+ f"Reading commong genotypes from {common_genotype_prefix}_{gene}.zarr"
+ )
+ common_genotypes = zarr.open(Path(f"{common_genotype_prefix}_{gene}.zarr"))[
+ :
+ ]
+
+ logger.info(f"common genotypes shape: {common_genotypes.shape}")
+
+ assert common_genotypes.shape[0] == x_pheno.shape[0]
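+            # Condition on the common variants by appending their genotypes to the phenotype covariates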
+ X = np.hstack((x_pheno, common_genotypes))
+ if do_scoretest:
+ logger.info("Running regression on each gene using scoretest from SEAK")
+ X = np.hstack((np.ones((X.shape[0], 1)), X))[mask]
+ # adding bias column
+ logger.info(f"X shape: {X.shape}, Y shape: {y.shape}")
+
+ # compute null_model for score test
+ if len(np.unique(y)) == 2:
+ logger.info(
+ "Fitting binary model since only found two distinct y values"
+ )
+ model_score = scoretest.ScoretestLogit(y, X)
+ else:
+ logger.info("Fitting linear model")
+ model_score = scoretest.ScoretestNoK(y, X)
+ gene_stats = regress_on_gene_scoretest(gene, burdens[mask, i], model_score)
+ else:
+ logger.info("Running regression on each gene using OLS")
+ gene_stats = regress_on_gene(
+ gene, burdens[:, i], y, X, use_bias, use_x_pheno
+ )
+
+ genes_betas_pvals.append(gene_stats)
+ genes_betas_pvals = [x for x in genes_betas_pvals if x is not None]
+ regressed_genes, betas, pvals = separate_parallel_results(genes_betas_pvals)
+ y_phenotypes = config["data"]["dataset_config"]["y_phenotypes"]
+ regressed_phenotypes = [y_phenotypes] * len(regressed_genes)
+ result = pd.DataFrame(
+ {
+ "phenotype": itertools.chain(*regressed_phenotypes),
+ "gene": itertools.chain(*regressed_genes),
+ "beta": itertools.chain(*betas),
+ "pval": itertools.chain(*pvals),
+ }
+ )
+ return result
+
+
if __name__ == "__main__":
cli()
diff --git a/deeprvat/deeprvat/common_variant_condition_utils.py b/deeprvat/deeprvat/common_variant_condition_utils.py
new file mode 100644
index 00000000..417a006f
--- /dev/null
+++ b/deeprvat/deeprvat/common_variant_condition_utils.py
@@ -0,0 +1,220 @@
+# Implement a pipeline that re-tests significant associations while controlling for common variants
+
+import pandas as pd
+import pyranges as pr
+from pyarrow.parquet import ParquetFile
+import scipy as sp
+import pickle
+import numpy as np
+import zarr
+from pathlib import Path
+from numcodecs import Blosc
+import logging
+from deeprvat.utils import pval_correction, standardize_series
+import click
+import sys
+import yaml
+import shutil
+import os
+
+
+compression_level = 1
+
+logging.basicConfig(
+ format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
+ level="INFO",
+ stream=sys.stdout,
+)
+logger = logging.getLogger(__name__)
+
+
+@click.group()
+def cli():
+ pass
+
+
+@cli.command()
+@click.option("--pval-correction-method", type=str, default="Bonferroni")
+@click.option("--alpha", type=str, default=0.05)
+@click.option("--debug", is_flag=True)
+@click.argument("config-file", type=click.Path(exists=True))
+@click.argument("res-file", type=click.Path(exists=True))
+@click.argument("out-parquet", type=click.Path())
+@click.argument("out-gene-ids", type=click.Path())
+def get_significant_genes(
+ alpha,
+ pval_correction_method,
+ debug,
+ config_file,
+ res_file,
+ out_parquet,
+ out_gene_ids,
+):
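+    """
+    Re-apply multiple-testing correction to the association results in res_file
+    and write the significant genes (merged with the gene annotation file from the
+    config) to out_parquet, plus their gene ids as a .npy array to out_gene_ids.
+    """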
+
+ with open(config_file) as f:
+ config = yaml.safe_load(f)
+
+ gene_file = config["data"]["dataset_config"]["gene_file"]
+ logger.info(f"reading gene file from {gene_file}")
+
+ gene_df = pd.read_parquet(gene_file)
+ gene_df = gene_df.rename(columns={"id": "gene", "gene": "ensembl_id_version"})
+
+ logger.info(
+ f"reading association testing resultsf from {res_file} and re-doing multiple testing correction using {pval_correction_method}"
+ )
+ res = pd.read_parquet(res_file)
+ res = pval_correction(res, alpha=alpha, correction_type=pval_correction_method)
+
+ sig_genes = res.query("significant")[["gene", "pval", "pval_corrected"]].merge(
+ gene_df
+ )
+ logger.info(f"number of significant genes {len(sig_genes)}")
+ sig_genes = sig_genes.set_index("gene")
+ logger.info(sig_genes)
+ if debug:
+ sig_genes = sig_genes.head(2)
+
+ sig_genes.to_parquet(out_parquet)
+ genes_npy = np.array(sig_genes.index)
+ np.save(out_gene_ids, genes_npy)
+
+
+@cli.command()
+@click.option("--gtf-file", type=click.Path(exists=True))
+@click.option("--padding", type=int, default=0)
+@click.option("--standardize", is_flag=True)
+@click.argument("config-file", type=click.Path(exists=True))
+@click.argument("sig-gene-file", type=click.Path(exists=True))
+@click.argument("genotype-file", type=click.Path(exists=True))
+@click.argument("sample-file") # , type=click.Path(exists=True))
+@click.argument("out-dir", type=click.Path())
+def prepare_genotypes_per_gene(
+ standardize: bool,
+ gtf_file: str,
+ padding: int,
+ sample_file: str,
+ sig_gene_file: str,
+ genotype_file,
+ config_file: str,
+ out_dir: str,
+):
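+    """
+    For each significant gene, extract the clumped common-variant genotypes within
+    the (optionally padded) gene region, align them to the sample order used for
+    the burdens, convert reference allele counts to minor allele counts, optionally
+    standardize them, and write one zarr array per gene to out_dir. Genes without
+    variants in the region get an array of zeros.
+    """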
+
+ # Get the path to the active Conda environment
+ conda_env_path = os.environ.get("CONDA_PREFIX")
+
+ # Check if a Conda environment is activated
+ if conda_env_path:
+ logger.info(f"Active Conda environment: {conda_env_path}")
+ else:
+ logger.info("No Conda environment is currently activated.")
+ fillna = True
+
+ sig_genes = pd.read_parquet(sig_gene_file)
+ logger.info(f"Number of significant genes: {len(sig_genes)}")
+
+ logger.info(f"reading ordered samples (as in x/y/burdens.zarr) from {sample_file}")
+ ordered_sample_ids = zarr.open(sample_file)[:]
+ ordered_sample_ids = [int(i) for i in ordered_sample_ids]
+ n_total_samples = len(ordered_sample_ids)
+ logger.info(f"total number of samples: {n_total_samples}")
+
+ logger.info(f"reading genome annotation file from {gtf_file}")
+ genome_annotation = pr.read_gtf(gtf_file)
+ gene_annotation = genome_annotation[genome_annotation.Feature == "gene"]
+
+ logger.info(f"reading clumped genotypes from {genotype_file}")
+
+ var_names = ParquetFile(genotype_file).schema.names
+ split_var_names = pd.Series(var_names[6:]).str.split(":", expand=True)
+
+ variants = pr.from_dict(
+ {
+ "Chromosome": split_var_names[0].astype(str),
+ "Start": split_var_names[1].astype(int),
+ "End": split_var_names[1].astype(int) + 1,
+ "var_name": pd.Series(var_names[6:]),
+ }
+ )
+
+ logger.info(f"Using padding {padding}bp around each gene")
+
+ genes_with_no_variants = []
+ for gene_id in list(sig_genes.index):
+ gene_ensembl_id = sig_genes.loc[gene_id]["ensembl_id_version"].split(".")[0]
+ logger.info(f"writing genotypes for gene {gene_id}, {gene_ensembl_id}")
+ gene_annotation_expanded = gene_annotation.copy()
+ gene_annotation_expanded.Start = gene_annotation_expanded.Start - padding
+ gene_annotation_expanded.End = gene_annotation_expanded.End + padding
+
+ included_vars = variants.intersect(
+ gene_annotation_expanded[
+ gene_annotation_expanded.gene_id.str.startswith(gene_ensembl_id)
+ ]
+ )
+ included_vars = (
+ included_vars.as_df()["var_name"].to_list() if included_vars else []
+ )
+
+ if len(included_vars) > 0:
+ logger.info(
+ f"Loading genotypes for {len(included_vars)} variants in gene region"
+ )
+ ref_ac_df = pd.read_parquet(genotype_file, columns=["IID"] + included_vars)
+
+ selected_genos = pd.Series(ordered_sample_ids, name="IID").to_frame()
+
+ selected_genos = selected_genos.merge(ref_ac_df, how="left", on="IID")
+ selected_genos = selected_genos.rename(columns={"IID": "sample"}).set_index(
+ "sample"
+ )
+ assert all(selected_genos.index == ordered_sample_ids)
+
+ if fillna:
+ logger.info("Filling nan genotypes with 0")
+ selected_genos = selected_genos.fillna(0)
+ logger.info(
+ "taking 2 - AC to get minor allele counts since plink returns reference allele counts"
+ )
+ selected_genos = 2 - selected_genos
+ logger.info("summary of minor allele frquencies")
+
+ logger.info(
+ (selected_genos.sum(axis=0) / (2 * len(selected_genos))).describe()
+ )
+
+ if standardize:
+ logger.info(" Standardizing input genotypes")
+ for col in selected_genos:
+ selected_genos[col] = standardize_series(selected_genos[col])
+ this_genos = np.array(selected_genos)
+ logger.info(this_genos.shape)
+ else:
+ logger.info("Gene has no variants, just writing array of zeros ")
+ this_genos = np.zeros((n_total_samples, 1))
+ genes_with_no_variants.append(gene_ensembl_id)
+
+ out_file = Path(out_dir) / f"genotypes_gene_{gene_id}.zarr"
+ if os.path.exists(out_file):
+ logger.info(f"removing existing zarr file {out_file}")
+ shutil.rmtree(out_file)
+
+ gene_x = zarr.open(
+ out_file,
+ mode="a",
+ shape=(n_total_samples,) + this_genos.shape[1:],
+ chunks=(None, None),
+ dtype=np.float32,
+ compressor=Blosc(clevel=compression_level),
+ )
+ gene_x[:] = this_genos
+
+ logger.info(
+ f"Genes with no variants: {len(genes_with_no_variants), genes_with_no_variants}"
+ )
+ logger.info("finished")
+
+
+if __name__ == "__main__":
+ cli()
diff --git a/deeprvat/deeprvat/config.py b/deeprvat/deeprvat/config.py
index 1d4de29d..7627c7bb 100644
--- a/deeprvat/deeprvat/config.py
+++ b/deeprvat/deeprvat/config.py
@@ -25,7 +25,7 @@ def cli():
@cli.command()
-@click.option("--seed-gene-dir", type=click.Path(exists=True))
+@click.option("--association-only", is_flag=True)
@click.option("--phenotype", type=str)
@click.option("--baseline-results", type=click.Path(exists=True), multiple=True)
@click.option("--baseline-results-out", type=click.Path())
@@ -33,23 +33,23 @@ def cli():
@click.argument("old_config_file", type=click.Path(exists=True))
@click.argument("new_config_file", type=click.Path())
def update_config(
- old_config_file: str,
+ association_only: bool,
phenotype: Optional[str],
- seed_gene_dir: Optional[str],
baseline_results: Tuple[str],
baseline_results_out: Optional[str],
seed_genes_out: Optional[str],
+ old_config_file: str,
new_config_file: str,
):
"""
Select seed genes based on baseline results and update the configuration file.
+ :param association_only: Update config file only for association testing
+ :type association_only: bool
:param old_config_file: Path to the old configuration file.
:type old_config_file: str
:param phenotype: Phenotype to update in the configuration.
:type phenotype: Optional[str]
- :param seed_gene_dir: Directory containing seed genes.
- :type seed_gene_dir: Optional[str]
:param baseline_results: Paths to baseline result files.
:type baseline_results: Tuple[str]
:param baseline_results_out: Path to save the updated baseline results.
@@ -63,9 +63,9 @@ def update_config(
Selected seed genes saved to seed_genes_out.parquet.
Optionally, save baseline results to a parquet file if baseline_results_out is specified.
"""
- if seed_gene_dir is None and len(baseline_results) == 0:
+ if not association_only and len(baseline_results) == 0:
raise ValueError(
- "One of --seed-gene-dir and --baseline-results " "must be specified"
+ "One of --baseline-results or --association-only must be specified"
)
with open(old_config_file) as f:
@@ -74,7 +74,8 @@ def update_config(
if phenotype is not None:
logger.info(f"Updating config for phenotype {phenotype}")
config["data"]["dataset_config"]["y_phenotypes"] = [phenotype]
- config["training_data"]["dataset_config"]["y_phenotypes"] = [phenotype]
+ if not association_only:
+ config["training_data"]["dataset_config"]["y_phenotypes"] = [phenotype]
# For using seed genes from results of baseline methods
if len(baseline_results) > 0:
@@ -86,7 +87,7 @@ def update_config(
)
seed_config = config["phenotypes"][phenotype]
correction_method = seed_config.get("correction_method", None)
- min_seed_genes = seed_config.get("min_seed_genes", None)
+ min_seed_genes = seed_config.get("min_seed_genes", 3)
max_seed_genes = seed_config.get("max_seed_genes", None)
threshold = seed_config.get("pvalue_threshold", None)
assert (
@@ -105,28 +106,34 @@ def update_config(
]
)
if "EAC" in baseline_df:
+            # filter for genes with expected allele count > 50 (as done by Karczewski et al.)
baseline_df = baseline_df.query("EAC > 50")
else:
logger.info("Not performing EAC filtering of baseline results")
logger.info(f" Correcting p-values using {correction_method} method")
+ alpha = config.get("alpha_seed_genes", config.get("alpha"))
baseline_df = pval_correction(
- baseline_df, config["alpha"], correction_type=correction_method
+ baseline_df, alpha, correction_type=correction_method
)
-
baseline_df = baseline_df.sort_values("pval_corrected")
if baseline_results_out is not None:
baseline_df.to_parquet(baseline_results_out, engine="pyarrow")
-
if correction_method is not None:
- if len(baseline_df.query("significant")) < 5:
+
+ logger.info(f"Using significant genes with corrected pval < {alpha}")
+ if (
+ len(baseline_df.query("significant")["gene"].unique())
+ < min_seed_genes
+ ):
logger.info(
- "Selecting top 5 genes from baseline because less than 5 genes are significant"
+ f"Selecting top {min_seed_genes} genes from baseline because less than {min_seed_genes} genes are significant"
)
- baseline_df = baseline_df.head(5) # TODO make this flexible
+ baseline_df = baseline_df.drop_duplicates(subset="gene").head(
+ min_seed_genes
+ ) # TODO make this flexible
else:
baseline_df = baseline_df.query("significant")
- logger.info(f" {len(baseline_df)} significant genes from baseline")
else:
if threshold is not None:
baseline_temp = baseline_df.query(f"pval_corrected < @threshold")
@@ -145,6 +152,7 @@ def update_config(
)
baseline_df = baseline_df.drop_duplicates(subset="gene")
+ logger.info(f" {len(baseline_df)} significant genes from baseline")
genes = pd.read_parquet(
config["data"]["dataset_config"]["gene_file"], engine="pyarrow"
diff --git a/deeprvat/deeprvat/evaluate.py b/deeprvat/deeprvat/evaluate.py
index 4e671da6..77188be4 100644
--- a/deeprvat/deeprvat/evaluate.py
+++ b/deeprvat/deeprvat/evaluate.py
@@ -2,13 +2,17 @@
import sys
from pathlib import Path
from typing import Dict, Optional, Tuple
+from itertools import combinations
+import random
+import os
import click
import numpy as np
import pandas as pd
import yaml
+from seak.cct import cct
-from deeprvat.utils import pval_correction
+from deeprvat.utils import pval_correction, bfcorrect_df
logging.basicConfig(
format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
@@ -42,7 +46,7 @@ def get_baseline(
phenotype=None,
min_eaf=50,
alpha: float = 0.05,
- correction_method: str = "FDR",
+ correction_method: str = "Bonferroni",
) -> pd.DataFrame:
baseline = pd.concat([pd.read_parquet(p) for p in paths])
if "EAC" in baseline.columns:
@@ -57,7 +61,7 @@ def get_baseline(
df = pval_correction(baseline, alpha, correction_type=correction_method)
df["experiment_group"] = experiment_name
- df["correction_method"] = "FDR"
+ df["correction_method"] = correction_method
df["experiment"] = "Baseline"
return df
@@ -68,7 +72,7 @@ def get_baseline_results(
pheno,
deeprvat_genes: np.ndarray,
alpha: float = 0.05,
- correction_method: str = "FDR",
+ correction_method: str = "Bonferroni",
):
min_eaf = config.get("min_eaf_baseline", 50)
@@ -80,56 +84,98 @@ def get_baseline_results(
): f"{r['base']}/{pheno}/{r['type']}/eval/burden_associations.parquet"
for r in config["baseline_results"]
}
-
+ logger.info(f"reading baseline from {baseline_paths}")
for (t, m), p in baseline_paths.items():
- result_list.append(
- get_baseline(
- [p],
- f"baseline_{t}_{m}",
- deeprvat_genes,
- phenotype=pheno,
- min_eaf=min_eaf,
- alpha=alpha,
- correction_method=correction_method,
+ if os.path.exists(p):
+ result_list.append(
+ get_baseline(
+ [p],
+ f"baseline_{t}_{m}",
+ deeprvat_genes,
+ phenotype=pheno,
+ min_eaf=min_eaf,
+ alpha=alpha,
+ correction_method=correction_method,
+ )
)
- )
-
- return pd.concat(result_list)
+ else:
+ logger.warning(f"Baseline path {p} doesn't exist")
+ if len(result_list) > 0:
+ res = pd.concat(result_list)
+ else:
+ logger.warning("No baseline data set existed. Returning empty data frame")
+ res = pd.DataFrame()
+ return res
def combine_results(
deeprvat_results: pd.DataFrame,
baseline_results: pd.DataFrame,
- correction_method: str = "FDR",
+ correction_method: str = "Bonferroni",
alpha: float = 0.05,
+ combine_pval: str = "Bonferroni",
):
baseline_original = baseline_results.copy()
baseline_original["Discovery type"] = "Baseline"
- deeprvat_results["Discovery type"] = "New DeepRVAT discovery"
- baseline_results["Discovery type"] = "Seed gene"
- combined_results = pd.concat([deeprvat_results, baseline_results])
+ deeprvat_results["Discovery type"] = "DeepRVAT discovery"
- combined = pval_correction(
- combined_results, alpha, correction_type=correction_method
+ deeprvat_results = pval_correction(
+ deeprvat_results, alpha, correction_type=correction_method
)
baseline_combined = baseline_original.copy()
baseline_combined["experiment_group"] = "baseline_combined"
- baseline_combined = pval_correction(
- baseline_combined, alpha, correction_type=correction_method
- )
- combined["experiment"] = "DeepRVAT"
- combined["experiment_group"] = "DeepRVAT"
- combined["correction_method"] = correction_method
+ if len(baseline_original) > 0:
+ if combine_pval is not None:
+ print("Aggregating baseline pvalues to one pvalue per gene")
+ baseline_combined = aggregate_pvals_per_gene(
+ baseline_combined, combine_pval
+ )
+ # should only be one pval per gene left
+ assert baseline_combined.groupby("gene").size().unique() == np.array([1])
+ baseline_combined = pval_correction(
+ baseline_combined, alpha, correction_type=correction_method
+ )
+ baseline_original_corrected = pd.DataFrame()
+ for method in baseline_original["experiment_group"].unique():
+ this_corrected = pval_correction(
+ baseline_original.copy().query("experiment_group == @method"),
+ alpha,
+ correction_type=correction_method,
+ )
+ baseline_original_corrected = pd.concat(
+ [baseline_original_corrected, this_corrected]
+ )
+ # just for sanity check
+ logger.info("Number of tests for each baseline method")
+ baseline_original_corrected["n_tests"] = (
+ baseline_original_corrected["pval_corrected"]
+ / baseline_original_corrected["pval"]
+ )
+ logger.info(
+ baseline_original_corrected.groupby("experiment_group")["n_tests"].unique()
+ )
+ baseline_original_corrected = baseline_original_corrected.drop(
+ columns="n_tests"
+ )
+ ######
+ else:
+ baseline_original_corrected = baseline_original
- combined = pd.concat([combined, baseline_original, baseline_combined])
+ deeprvat_results["experiment"] = "DeepRVAT"
+ deeprvat_results["experiment_group"] = "DeepRVAT"
+ deeprvat_results["correction_method"] = correction_method
+
+ combined = pd.concat(
+ [deeprvat_results, baseline_original_corrected, baseline_combined]
+ )
combined["-log10pval"] = -np.log10(combined["pval"])
combined["Discovery type"] = pd.Categorical(
- combined["Discovery type"], ["New DeepRVAT discovery", "Seed gene", "Baseline"]
+ combined["Discovery type"], ["DeepRVAT discovery", "Baseline"]
)
return combined.astype({"significant": bool})
@@ -172,26 +218,58 @@ def get_pvals(results, method_mapping=None, phenotype_mapping={}):
return pvals
+def min_Bonferroni_aggregate(pvals):
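+    # Bonferroni-style aggregation: the smallest p-value multiplied by the number of tests per group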
+ pval = min(pvals * len(pvals))
+ return pval
+
+
+def aggregate_pvals_per_gene(df, agg_method):
+ grouping_cols = [
+ "phenotype",
+ "gene",
+ "experiment",
+ "experiment_group",
+ "repeat_combi",
+ "correction_method",
+ ]
+ grouping_cols = list(set(grouping_cols).intersection(set(df.columns)))
+ select_cols = grouping_cols + ["pval"]
+ agg_results = df.copy()[select_cols]
+ print(f"aggregating pvalues using grouping cols {grouping_cols}")
+ agg_results = agg_results.groupby(grouping_cols, dropna=False)
+ if agg_method == "Bonferroni":
+ print("using Bonferroni")
+ agg_results = agg_results.agg(min_Bonferroni_aggregate).reset_index()
+ elif agg_method == "cct":
+ print("using cct")
+ agg_results = agg_results.agg(cct).reset_index()
+ else:
+ raise ValueError(f"Unknown agg_method type: {agg_method}. ")
+ return agg_results
+
+
def process_results(
results: pd.DataFrame,
- n_repeats: int = 6,
alpha: float = 0.05,
- correction_method: str = "FDR",
+ correction_method: str = "Bonferroni",
+ combine_pval: str = "Bonferroni",
) -> Tuple[pd.DataFrame, pd.DataFrame]:
- deeprvat_results = results.query(
- f'experiment == "DeepRVAT ({n_repeats} repeats)"'
- ' and experiment_group == "DeepRVAT"'
- )
- baseline_results = results.query(
- "experiment_group in @BASELINE_GROUPS"
- " and correction_method == @correction_method"
- )
+
+ # TODO change this query!
+ deeprvat_results = results.query('experiment_group == "DeepRVAT"')
+
+ assert (deeprvat_results.groupby("gene").size() == 1).all()
+ baseline_results = results.query("experiment_group in @BASELINE_GROUPS")
+ if "correction_method" in baseline_results.columns:
+ # if use_baseline_results is not True the correction_method column is not in results
+ baseline_results = results.query("correction_method == @correction_method")
combined_results = combine_results(
deeprvat_results,
baseline_results,
correction_method=correction_method,
alpha=alpha,
+ combine_pval=combine_pval,
)
all_pvals = get_pvals(combined_results, method_mapping=METHOD_NAMES)
@@ -209,44 +287,29 @@ def process_results(
def evaluate_(
associations: pd.DataFrame,
alpha: float,
- seed_genes: Optional[pd.DataFrame],
- repeats: Optional[int] = None,
baseline_results: Optional[pd.DataFrame] = None,
debug: bool = False,
- correction_method: str = "FDR",
+ correction_method: str = "Bonferroni",
+ combine_pval: str = "Bonferroni",
):
- if seed_genes is not None:
- seed_gene_ids = seed_genes["id"]
- associations = associations.query("gene not in @seed_gene_ids")
-
- n_total_repeats = (
- repeats
- if repeats is not None
- else associations["model"]
- .str.split("_")
- .apply(lambda x: x[-1])
- .astype(int)
- .max()
- + 1
- )
- if debug:
- n_total_repeats = min(n_total_repeats, 2)
logger.info("Evaluation results:")
results = pd.DataFrame()
- for n_repeats in range(1, n_total_repeats + 1):
- rep_str = f"{n_repeats} repeats"
- repeat_mask = (
- associations["model"].str.split("_").apply(lambda x: x[-1]).astype(int)
- < n_repeats
- )
- this_result = associations[repeat_mask].copy()
-
- experiment_name = f"DeepRVAT ({n_repeats} repeats)"
- this_result["experiment"] = experiment_name
+    # TODO: this repeat filtering can probably be dropped entirely, since the full
+    # data frame is used now
+    n_repeats = 1
+    rep_str = f"{n_repeats} repeats"
+ repeat_mask = (
+ associations["model"].str.split("_").apply(lambda x: x[-1]).astype(int)
+ < n_repeats
+ )
+ results = associations[repeat_mask].copy()
- results = pd.concat([results, this_result])
+ results["experiment"] = "DeepRVAT"
+ ########### change until here ##################
results["-log10pval"] = -np.log10(results["pval"])
results["experiment_group"] = "DeepRVAT"
@@ -254,11 +317,10 @@ def evaluate_(
significant, all_pvalues = process_results(
results,
- n_repeats=n_total_repeats,
alpha=alpha,
correction_method=correction_method,
+ combine_pval=combine_pval,
)
-
return significant, all_pvalues
@@ -266,28 +328,31 @@ def evaluate_(
@click.command()
@click.option("--debug", is_flag=True)
@click.option("--phenotype", type=str)
-@click.option("--use-seed-genes", is_flag=True)
-@click.option("--correction-method", type=str, default="FDR")
-@click.option("--n-repeats", type=int)
+@click.option("--use-baseline-results", is_flag=True)
+@click.option("--correction-method", type=str, default="Bonferroni")
+@click.option(
+ "--combine-pval", type=str, default="Bonferroni"
+) # Bonferroni min pval per gene for multiple baseline tests
@click.argument("association-files", type=click.Path(exists=True), nargs=-1)
@click.argument("config-file", type=click.Path(exists=True))
@click.argument("out-dir", type=click.Path())
def evaluate(
debug: bool,
phenotype: Optional[str],
- use_seed_genes: bool,
+ use_baseline_results: bool,
correction_method: str,
- n_repeats: Optional[int],
association_files: Tuple[str],
config_file: str,
out_dir: str,
+ combine_pval,
):
+
with open(config_file) as f:
config = yaml.safe_load(f)
-
associations = pd.concat(
[pd.read_parquet(f, engine="pyarrow") for f in association_files]
)
+ logger.info("Associations loaded")
pheno = (
phenotype
if phenotype is not None
@@ -297,16 +362,8 @@ def evaluate(
alpha = config["alpha"]
- repeats = n_repeats if n_repeats is not None else config["n_repeats"]
-
- seed_genes = (
- pd.read_parquet(config["seed_gene_file"], engine="pyarrow")
- if (use_seed_genes and "seed_gene_file" in config)
- else None
- )
-
- if use_seed_genes:
- logger.info("Reading seed gene discovery results")
+ if use_baseline_results:
+ logger.info("Reading baseline results")
deeprvat_genes = associations["gene"].unique()
baseline_results = get_baseline_results(
config,
@@ -317,17 +374,16 @@ def evaluate(
)
else:
baseline_results = pd.DataFrame()
-
significant, all_pvals = evaluate_(
associations,
alpha,
- seed_genes,
- repeats=repeats,
baseline_results=baseline_results,
correction_method=correction_method,
debug=debug,
+ combine_pval=combine_pval,
)
-
+ logger.info("DeepRVAT discvoeries:")
+ logger.info(significant.query('Method == "DeepRVAT"'))
logger.info("Saving results")
out_path = Path(out_dir)
significant.to_parquet(out_path / f"significant.parquet", engine="pyarrow")
diff --git a/deeprvat/deeprvat/train.py b/deeprvat/deeprvat/train.py
index ecbdef65..d730e03d 100644
--- a/deeprvat/deeprvat/train.py
+++ b/deeprvat/deeprvat/train.py
@@ -202,11 +202,9 @@ def make_dataset_(
logger.info(f"Using {n_samples} samples for training and validation")
ds = Subset(ds, range(n_samples))
-
dl = DataLoader(
ds, collate_fn=collate_fn, **config["training_data"]["dataloader_config"]
)
-
logger.info(" Generating dataset")
batches = [
batch
diff --git a/deeprvat/seed_gene_discovery/config.yaml b/deeprvat/seed_gene_discovery/config.yaml
index f4a5eba0..a90e0a2e 100644
--- a/deeprvat/seed_gene_discovery/config.yaml
+++ b/deeprvat/seed_gene_discovery/config.yaml
@@ -2,10 +2,10 @@ phenotypes:
- Apolipoprotein_A
# - Apolipoprotein_B
# - Calcium
-# - Cholesterol
+# - Cholesterol_statin_corrected
# - HDL_cholesterol
# - IGF_1
-# - LDL_direct
+# - LDL_direct_statin_corrected
# - SHBG
# - Total_bilirubin
# - Triglycerides
@@ -69,6 +69,8 @@ data:
MAF: 0.001 #is updated automatically when updating the config with update_config
x_phenotypes:
- age
+ - age2
+ - age_sex
- genetic_sex
- genetic_PC_1
- genetic_PC_2
diff --git a/deeprvat/seed_gene_discovery/seed_gene_discovery.py b/deeprvat/seed_gene_discovery/seed_gene_discovery.py
index cc7e69b5..6a187a7e 100644
--- a/deeprvat/seed_gene_discovery/seed_gene_discovery.py
+++ b/deeprvat/seed_gene_discovery/seed_gene_discovery.py
@@ -692,15 +692,15 @@ def run_association(
logger.info("Grouping variants by gene")
exploded_annotations = (
dataset.annotation_df.query("id in @all_variants")
- .explode("gene_ids")
+ .explode("gene_id")
.reset_index()
.drop_duplicates()
.set_index("id")
)
- grouped_annotations = exploded_annotations.groupby("gene_ids")
+ grouped_annotations = exploded_annotations.groupby("gene_id")
gene_ids = pd.read_parquet(dataset.gene_file, columns=["id"])["id"].to_list()
gene_ids = list(
- set(gene_ids).intersection(set(exploded_annotations["gene_ids"].unique()))
+ set(gene_ids).intersection(set(exploded_annotations["gene_id"].unique()))
)
logger.info(f"Number of genes to test: {len(gene_ids)}")
diff --git a/deeprvat/utils.py b/deeprvat/utils.py
index 3ecad145..a9c18801 100644
--- a/deeprvat/utils.py
+++ b/deeprvat/utils.py
@@ -5,7 +5,8 @@
import shutil
import sys
import pickle
-from typing import Any, Callable, Dict, Iterable
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, Union
import optuna
import numpy as np
@@ -166,7 +167,10 @@ def my_quantile_transform(x, seed=1):
"nan" values are kept
"""
np.random.seed(seed)
- x_transform = x.copy().to_numpy()
+ x_transform = x.copy()
+ if isinstance(x_transform, pd.Series):
+ x_transform = x_transform.to_numpy()
+
is_nan = np.isnan(x_transform)
n_quantiles = np.sum(~is_nan)
diff --git a/deeprvat_env.yaml b/deeprvat_env.yaml
index 778713fe..b8b26123 100644
--- a/deeprvat_env.yaml
+++ b/deeprvat_env.yaml
@@ -16,11 +16,13 @@ dependencies:
- optuna=2.10
- pandas=1.5
- pyarrow=11.0
+ - pyranges=0.0.129
- python=3.8
- pytorch=1.13
- pytorch-cuda=11
- pytorch-lightning=1.5
- pyyaml=5.4
+ - regenie=3.4.1
- scikit-learn=1.1
- scipy=1.10
- setuptools=59.5
@@ -35,3 +37,4 @@ dependencies:
- plotnine=0.10.1
- pip:
- git+https://github.com/HealthML/seak@v0.4.3
+ - bgen==1.6.3
diff --git a/deeprvat_env_no_gpu.yml b/deeprvat_env_no_gpu.yml
index b98d1cfb..2322a176 100644
--- a/deeprvat_env_no_gpu.yml
+++ b/deeprvat_env_no_gpu.yml
@@ -9,15 +9,18 @@ dependencies:
- dask=2023.5
- fastparquet=0.5
- h5py=3.1
+ - mkl!=2024.1.0
- numcodecs=0.11
- numpy=1.21
- optuna=2.10
- pandas=1.5
- pyarrow=11.0
+ - pyranges=0.0.129
- python=3.8
- pytorch=1.13
- pytorch-lightning=1.5
- pyyaml=5.4
+ - regenie=3.4.1
- scikit-learn=1.1
- scipy=1.10
- setuptools=59.5
@@ -32,3 +35,4 @@ dependencies:
- plotnine=0.10.1
- pip:
- git+https://github.com/HealthML/seak@v0.4.3
+ - bgen==1.6.3
diff --git a/docs/_static/annotation_rulegraph.svg b/docs/_static/annotation_rulegraph.svg
new file mode 100644
index 00000000..7366a07c
--- /dev/null
+++ b/docs/_static/annotation_rulegraph.svg
@@ -0,0 +1,263 @@
+
+
+
+
+
diff --git a/docs/annotations.md b/docs/annotations.md
index a9532220..94bd9ff7 100644
--- a/docs/annotations.md
+++ b/docs/annotations.md
@@ -2,15 +2,21 @@
This pipeline is based on [snakemake](https://snakemake.readthedocs.io/en/stable/). It uses [bcftools + samstools](https://www.htslib.org/), as well as [perl](https://www.perl.org/), [deepRiPe](https://ohlerlab.mdc-berlin.de/software/DeepRiPe_140/) and [deepSEA](http://deepsea.princeton.edu/) as well as [VEP](http://www.ensembl.org/info/docs/tools/vep/index.html), including plugins for [primateAI](https://github.com/Illumina/PrimateAI) and [spliceAI](https://github.com/Illumina/SpliceAI). DeepRiPe annotations were acquired using [faatpipe repository by HealthML](https://github.com/HealthML/faatpipe)[[1]](#reference-1-target) and DeepSea annotations were calculated using [kipoi-veff2](https://github.com/kipoi/kipoi-veff2)[[2]](#reference-2-target), abSplice scores were computet using [abSplice](https://github.com/gagneurlab/absplice/)[[3]](#reference-3-target)
-![dag](_static/annotation_pipeline_dag.png)
-*Figure 1: Example DAG of annoation pipeline using only two bcf files as input.*
+![dag](_static/annotation_rulegraph.svg)
+
+*Figure 1: Rulegraph of the annotation pipeline.*
+
+## Output
+This pipeline outputs a parquet file including all annotations as well as a file containing the IDs of all protein-coding genes needed to run DeepRVAT.
+Besides this, the pipeline outputs a PCA transformation matrix for deepSEA as well as the means and standard deviations used to standardize deepSEA scores before the PCA analysis. This is helpful for recreating results on a different dataset.
+Furthermore, the pipeline outputs one annotation file for VEP, CADD, DeepRiPe, DeepSea and Absplice for each input vcf file. The tool then concatenates the files, performs PCA on the deepSEA scores and merges the result into a single file.
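+
+As a minimal sketch, the stored standardization parameters and PCA matrix could be re-applied to deepSEA scores from a new dataset roughly as follows (all file names below are placeholders, not the exact names written by the pipeline):
+
+```python
+import numpy as np
+import pandas as pd
+
+# Placeholder paths -- substitute the files produced by your pipeline run
+means = np.load("deepSEA_PCA/means.npy")  # per-feature means
+sds = np.load("deepSEA_PCA/standard_deviations.npy")  # per-feature standard deviations
+components = np.load("deepSEA_PCA/pca_components.npy")  # shape (n_components, n_features)
+
+new_scores = pd.read_parquet("new_dataset_deepSEA_scores.parquet").to_numpy()
+
+# Standardize with the stored statistics, then project onto the stored components
+standardized = (new_scores - means) / sds
+deepsea_pcs = standardized @ components.T
+```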
## Input
-The pipeline uses left-normalized bcf files containing variant information, a reference fasta file as well as a text file that maps data blocks to chromosomes as input. It is expected that the bcf files contain the columns "CHROM" "POS" "ID" "REF" and "ALT". Any other columns, including genotype information are stripped from the data before annotation tools are used on the data. The variants may be split into several vcf files for each chromosome and each "block" of data. The filenames should then contain the corresponding chromosome and block number. The pattern of the file names, as well as file structure may be specified in the corresponding [config file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml).
+The pipeline uses left-normalized bcf files containing variant information, a reference fasta file as well as a text file that maps data blocks to chromosomes as input. It is expected that the bcf files contain the columns "CHROM" "POS" "ID" "REF" and "ALT". Any other columns, including genotype information, are stripped from the data before annotation tools are used on the data. The variants may be split into several vcf files for each chromosome and each "block" of data. The filenames should then contain the corresponding chromosome and block number. The pattern of the file names, as well as the file structure, may be specified in the corresponding [config file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml). The pipeline also requires the input data and repositories described in [requirements](#requirements).
-(requirements-target)=
-## Requirements
+(requirements)=
+## Requirements
BCFtools as well as HTSlib should be installed on the machine,
- [CADD](https://github.com/kircherlab/CADD-scripts/tree/master/src/scripts) as well as
@@ -20,43 +26,39 @@ BCFtools as well as HTSlib should be installed on the machine,
- [faatpipe](https://github.com/HealthML/faatpipe), and the
- [vep-plugins repository](https://github.com/Ensembl/VEP_plugins/)
-will be installed by the pipeline together with the [plugins](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html) for primateAI and spliceAI. Annotation data for CADD, spliceAI and primateAI should be downloaded. The path to the data may be specified in the corresponding [config file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml).
-Download path:
+should be installed for running the pipeline, together with the [plugins](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html) for primateAI and spliceAI. Annotation data for CADD, spliceAI and primateAI should be downloaded. The path to the data may be specified in the corresponding [config file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml).
+Download paths:
- [CADD](https://cadd.bihealth.org/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. their Tabix Indices
- [SpliceAI](https://basespace.illumina.com/s/otSPW8hnhaZR): "genome_scores_v1.3"/"spliceai_scores.raw.snv.hg38.vcf.gz" and "spliceai_scores.raw.indel.hg38.vcf.gz"
- [PrimateAI](https://basespace.illumina.com/s/yYGFdGih1rXL) PrimateAI supplementary data/"PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz"
+- [AlphaMissense](https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz)
+A reference GTF file containing transcript annotations should also be provided; it can be downloaded from [here](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz).
-## Output
-
-The pipeline outputs one annotation file for VEP, CADD, DeepRiPe, DeepSea and Absplice for each input vcf-file. The tool further creates concatenated files for each tool and one merged file containing Scores from AbSplice, VEP incl. CADD, primateAI and spliceAI as well as principal components from DeepSea and DeepRiPe.
-
## Configure the annotation pipeline
The snakemake annotation pipeline is configured using a yaml file with the format akin to the [example file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml).
The config above would use the following directory structure:
```shell
-
-|-- reference
+|--reference
| |-- fasta file
-
-
-|-- metadata
-| |-- pvcf_blocks.txt
+| |-- GTF file
|-- preprocessing_workdir
-| |--reference
-| | |-- fasta file
| |-- norm
| | |-- bcf
| | | |-- bcf_input_files
| | | |-- ...
| | |-- variants
| | | |-- variants.tsv.gz
+| |-- preprocessed
+| | |-- genotypes.h5
+
|-- output_dir
| |-- annotations
| | |-- tmp
+| | | |-- deepSEA_PCA
|-- repo_dir
| |-- ensembl-vep
@@ -73,21 +75,20 @@ The config above would use the following directory structure:
+
```
-Bcf files created by the [preprocessing pipeline](preprocessing.md) are used as input data.
-The pipeline also uses the variant.tsv file as well as the reference file from the preprocesing pipeline.
+
+Bcf files created by the [preprocessing pipeline](https://deeprvat.readthedocs.io/en/latest/preprocessing.html) are used as input data. The input data directory should only contain the files needed.
+The pipeline also uses the variant.tsv file, the reference file and the genotypes file from the preprocessing pipeline.
+A GTF file as described in [requirements](#requirements) and the FASTA file used for preprocessing are also necessary.
The pipeline beginns by installing the repositories needed for the annotations, it will automatically install all repositories in the `repo_dir` folder that can be specified in the config file relative to the annotation working directory.
The text file mapping blocks to chromosomes is stored in `metadata` folder. The output is stored in the `output_dir/annotations` folder and any temporary files in the `tmp` subfolder. All repositories used including VEP with its corresponding cache as well as plugins are stored in `repo_dir/ensempl-vep`.
Data for VEP plugins and the CADD cache are stored in `annotation data`.
## Running the annotation pipeline
### Preconfiguration
-- Inside the annotation directory create a directory `repo_dir` and run the [annotation setup script](https://github.com/PMBio/deeprvat/blob/main/deeprvat/annotations/setup_annotation_workflow.sh)
- ```shell
- setup_annotation_workflow.sh repo_dir/ensembl-vep/cache repo_dir/ensembl-vep/Plugins repo_dir
- ```
- or manually clone the repositories mentioned in the [requirements](#requirements-target) into `repo_dir` and install the needed conda environments with
+- Clone the repositories mentioned in [requirements](#requirements) into `repo_dir` and install the needed conda environments with
```shell
mamba env create -f repo_dir/absplice/environment.yaml
mamba env create -f repo_dir/kipoi-veff2/environment.minimal.linux.yml
@@ -96,22 +97,18 @@ Data for VEP plugins and the CADD cache are stored in `annotation data`.
If you already have some of the needed repositories on your machine you can edit the paths in the [config](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml).
-- Inside the annotation directory create a directory `annotation_dir` and download/link the prescored files for CADD, SpliceAI, and PrimateAI (see [requirements](#requirements-target))
+- Inside the annotation directory create a directory `annotation_dir` and download/link the prescored files for CADD, SpliceAI, and PrimateAI (see [requirements](#requirements))
### Running the pipeline
+This pipeline should be run after the [preprocessing pipeline](https://deeprvat.readthedocs.io/en/latest/preprocessing.html), since it relies on some of its output files (specifically the bcf files in `norm/bcf/`, the variant files in `norm/variants/` and the genotype file `preprocessed/genotypes.h5`).
+
After configuration and activating the `deeprvat_annotations` environment run the pipeline using snakemake:
```shell
snakemake -j -s annotations.snakemake --configfile config/deeprvat_annotation.config --use-conda
```
-## Running the annotation pipeline without the preprocessing pipeline
-It is possible to run the annotation pipeline without having run the preprocessing prior to that.
-However, the annotation pipeline requires some files from this pipeline that then have to be created manually.
-- Left normalized bcf files from the input. These files do not have to contain any genotype information. "chrom, "pos", "ref" and "alt" columns will suffice.
-- a reference fasta file will have to be provided
-- A tab separated file containing all input variants "chrom, "pos", "ref" and "alt" entries each with a unique id.
## References
diff --git a/docs/preprocessing.md b/docs/preprocessing.md
index 8d79c876..b9564cf1 100644
--- a/docs/preprocessing.md
+++ b/docs/preprocessing.md
@@ -11,7 +11,7 @@ The important files that this pipeline produces that are needed in DeepRVAT are:
- **preprocessed/genotypes.h5** *The main sparse hdf5 file*
-- **norm/variants/variants.parquet** *List of variants i parquet format*
+- **norm/variants/variants.parquet** *List of variants in parquet format*
## Setup environment
@@ -226,4 +226,4 @@ After configuration and activating the environment run the pipeline using snakem
```shell
snakemake -j --configfile config/deeprvat_preprocess_config.yaml -s preprocess_no_qc.snakefile
-```
\ No newline at end of file
+```
diff --git a/example/annotations.parquet b/example/annotations.parquet
index f651e593..35f6ea5e 100644
Binary files a/example/annotations.parquet and b/example/annotations.parquet differ
diff --git a/example/annotations/preprocessing_workdir/preprocessed/genotypes.h5 b/example/annotations/preprocessing_workdir/preprocessed/genotypes.h5
new file mode 100644
index 00000000..e69de29b
diff --git a/example/annotations/reference/gencode.v44.annotation.gtf.gz b/example/annotations/reference/gencode.v44.annotation.gtf.gz
new file mode 100644
index 00000000..e69de29b
diff --git a/example/config.yaml b/example/config.yaml
index cd8119c0..c6a6a3fb 100644
--- a/example/config.yaml
+++ b/example/config.yaml
@@ -26,6 +26,7 @@ alpha: 0.05
n_burden_chunks: 2
n_regression_chunks: 2
+n_avg_chunks: 1
n_repeats: 2
@@ -98,6 +99,8 @@ training_data:
x_phenotypes:
- age
- genetic_sex
+ - age2
+ - age_sex
- genetic_PC_1
- genetic_PC_2
- genetic_PC_3
@@ -215,6 +218,8 @@ data:
x_phenotypes:
- age
- genetic_sex
+ - age2
+ - age_sex
- genetic_PC_1
- genetic_PC_2
- genetic_PC_3
diff --git a/example/genotypes.h5 b/example/genotypes.h5
index ae469d69..9ebb397b 100644
Binary files a/example/genotypes.h5 and b/example/genotypes.h5 differ
diff --git a/example/phenotypes.parquet b/example/phenotypes.parquet
index 1ab4b2db..240a7ce4 100644
Binary files a/example/phenotypes.parquet and b/example/phenotypes.parquet differ
diff --git a/lsf/config.yaml b/lsf/config.yaml
index d7bf68fd..e32c1617 100644
--- a/lsf/config.yaml
+++ b/lsf/config.yaml
@@ -1,132 +1,127 @@
phenotypes:
Apolipoprotein_A:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Apolipoprotein_A
Apolipoprotein_B:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Apolipoprotein_B
Calcium:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Calcium
- Cholesterol:
- correction_method: FDR
+ Cholesterol_statin_corrected:
+ correction_method: Bonferroni
n_training_genes: 40
- baseline_phenotype: Cholesterol
+ baseline_phenotype: Cholesterol_statin_corrected
Red_blood_cell_erythrocyte_count:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Red_blood_cell_erythrocyte_count
HDL_cholesterol:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: HDL_cholesterol
IGF_1:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: IGF_1
- LDL_direct:
- correction_method: FDR
+ LDL_direct_statin_corrected:
+ correction_method: Bonferroni
n_training_genes: 40
- baseline_phenotype: LDL_direct
+ baseline_phenotype: LDL_direct_statin_corrected
Lymphocyte_percentage:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Lymphocyte_percentage
Mean_platelet_thrombocyte_volume:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Mean_platelet_thrombocyte_volume
Mean_corpuscular_volume:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Mean_corpuscular_volume
Mean_reticulocyte_volume:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Mean_reticulocyte_volume
Neutrophill_count:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Neutrophill_count
Platelet_count:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Platelet_count
Platelet_crit:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Platelet_crit
Platelet_distribution_width:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Platelet_distribution_width
SHBG:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: SHBG
Standing_height:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Standing_height
Total_bilirubin:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Total_bilirubin
Triglycerides:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Triglycerides
Urate:
- correction_method: FDR
+ correction_method: Bonferroni
n_training_genes: 40
baseline_phenotype: Urate
Body_mass_index_BMI:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Body_mass_index_BMI
Glucose:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Glucose
Vitamin_D:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Vitamin_D
Albumin:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Albumin
Total_protein:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Total_protein
Cystatin_C:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Cystatin_C
Gamma_glutamyltransferase:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Gamma_glutamyltransferase
Alkaline_phosphatase:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Alkaline_phosphatase
Creatinine:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Creatinine
Whole_body_fat_free_mass:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Whole_body_fat_free_mass
Forced_expiratory_volume_in_1_second_FEV1:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Forced_expiratory_volume_in_1_second_FEV1
- QTC_interval:
- correction_method: FDR
- baseline_phenotype: QTC_interval
Glycated_haemoglobin_HbA1c:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: Glycated_haemoglobin_HbA1c
- WHR:
- correction_method: FDR
- baseline_phenotype: WHR
WHR_Body_mass_index_BMI_corrected:
- correction_method: FDR
+ correction_method: Bonferroni
baseline_phenotype: WHR_Body_mass_index_BMI_corrected
baseline_results:
@@ -153,7 +148,7 @@ n_repeats: 6
do_scoretest: True
training:
- min_variant_count: 1
+ min_variant_count: 0
n_bags: 1
drop_n_bags: 0
train_proportion: 0.8
@@ -169,11 +164,14 @@ training:
- Apolipoprotein_A
- Apolipoprotein_B
- Calcium
- - Cholesterol
+ - Cholesterol_statin_corrected
+ - Red_blood_cell_erythrocyte_count
+ - HDL_cholesterol
+ - IGF_1
+ - LDL_direct_statin_corrected
- Red_blood_cell_erythrocyte_count
- HDL_cholesterol
- IGF_1
- - LDL_direct
- Lymphocyte_percentage
- Mean_platelet_thrombocyte_volume
- Mean_corpuscular_volume
@@ -245,6 +243,8 @@ training_data:
y_transformation: quantile_transform
x_phenotypes:
- age
+ - age2
+ - age_sex
- genetic_sex
- genetic_PC_1
- genetic_PC_2
@@ -303,6 +303,7 @@ training_data:
- DeepRipe_plus_MBNL1_parclip
- DeepRipe_plus_QKI_parclip
- SpliceAI_delta_score
+ - alphamissense
use_common_variants: False
use_rare_variants: True
rare_embedding:
@@ -342,6 +343,7 @@ training_data:
- DeepRipe_plus_MBNL1_parclip
- DeepRipe_plus_QKI_parclip
- SpliceAI_delta_score
+ - alphamissense
thresholds:
combined_UKB_NFE_AF: "combined_UKB_NFE_AF < 1e-2"
CADD_PHRED: "CADD_PHRED > 5"
@@ -362,6 +364,8 @@ data:
y_transformation: quantile_transform
x_phenotypes:
- age
+ - age2
+ - age_sex
- genetic_sex
- genetic_PC_1
- genetic_PC_2
@@ -420,6 +424,7 @@ data:
- DeepRipe_plus_MBNL1_parclip
- DeepRipe_plus_QKI_parclip
- SpliceAI_delta_score
+ - alphamissense
gene_file: protein_coding_genes.parquet
use_common_variants: False
use_rare_variants: True
@@ -460,6 +465,7 @@ data:
- DeepRipe_plus_MBNL1_parclip
- DeepRipe_plus_QKI_parclip
- SpliceAI_delta_score
+ - alphamissense
thresholds:
combined_UKB_NFE_AF: "combined_UKB_NFE_AF < 1e-3"
CADD_PHRED: "CADD_PHRED > 5"
diff --git a/lsf/lsf.yaml b/lsf/lsf.yaml
index cc972219..407d2cba 100644
--- a/lsf/lsf.yaml
+++ b/lsf/lsf.yaml
@@ -1,6 +1,6 @@
__default__:
- "-q medium"
- - "-R \"select[(hname != 'odcf-cn11u15' && hname != 'odcf-cn31u13' && hname != 'odcf-cn31u21' && hname != 'odcf-cn23u23')]\""
+ - "-R \"select[(hname != 'odcf-cn11u15' && hname != 'odcf-cn11u17' && hname != 'odcf-cn33u24s03' && hname != 'odcf-cn23u25' && hname != 'odcf-cn11u13' && hname != 'odcf-cn31u13' && hname != 'odcf-cn31u21' && hname != 'odcf-cn23u23')]\""
# For association testing pipelines
@@ -51,6 +51,21 @@ regress:
combine_regression_chunks:
- "-q short"
+regenie_step1_splitl0:
+ - "-q short"
+
+regenie_step1_runl0:
+ - "-q medium"
+
+regenie_step1_runl1:
+ - "-q medium"
+
+regenie_step1:
+ - "-q verylong"
+
+regenie_step2:
+ - "-q medium"
+
# For CV (phenotype prediction) pipeline
@@ -69,20 +84,19 @@ deeprvat_delete_burden_cache:
deeprvat_best_cv_run:
- "-q short"
-deeprvat_train_cv:
- - "-q gpu-lowprio"
- - "-gpu num=1:j_exclusive=yes:mode=exclusive_process:gmem=10.7G"
- - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e071-gpu06')]\""
- # - "-R tensorcore"
- # - "-L /bin/bash"
-
-deeprvat_train_bagging:
- - "-q gpu-lowprio"
- - "-gpu num=1:j_exclusive=yes:mode=exclusive_process:gmem=10.7G"
+deeprvat_train:
+ - "-q gpu"
+ - "-gpu num=1:gmem=10.7G"
- "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
# - "-R tensorcore"
# - "-L /bin/bash"
+
+deeprvat_compute_burdens:
+ - "-q gpu"
+ - "-gpu num=1:j_exclusive=yes:mode=exclusive_process:gmem=15.7G"
+
+
deeprvat_compute_burdens:
- "-q gpu-lowprio"
- "-gpu num=1:j_exclusive=yes:mode=exclusive_process:gmem=10.7G"
@@ -91,6 +105,30 @@ deeprvat_compute_burdens:
# - "-R tensorcore"
# - "-L /bin/bash"
+prepare_genotypes_per_gene:
+ - "-q long"
+deeprvat_regress:
+ - "-q long"
+
+average_burdens:
+ - "-q long"
+
+regress_missense:
+ - "-q long"
+regress_plof:
+ - "-q long"
+
+seed_gene_regress_missense:
+ - "-q long"
+
+seed_gene_association_dataset:
+ - "-q long"
+association_dataset:
+ - "-q long"
+
+seed_gene_regress_plof:
+ - "-q medium"
+
deeprvat_compute_plof_burdens:
- "-q medium"
diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
index 29991c48..a97f38ed 100644
--- a/pipelines/annotations.snakefile
+++ b/pipelines/annotations.snakefile
@@ -1,9 +1,37 @@
import pandas as pd
from pathlib import Path
-
+from glob import glob
+from itertools import chain
+import re
+import os
+import yaml
configfile: "config/deeprvat_annotation_config.yaml"
-
+## helper functions
+def tryint(s):
+ """
+ Return an int if possible, or `s` unchanged.
+ """
+ try:
+ return int(s)
+ except ValueError:
+ return s
+
+def alphanum_key(s):
+ """
+ Turn a string into a list of string and number chunks.
+
+ >>> alphanum_key("z23a")
+ ["z", 23, "a"]
+
+ """
+ return [ tryint(c) for c in re.split('([0-9]+)', s) ]
+
+def human_sort(l):
+ """
+    Sort a list in place, treating runs of digits inside strings as numbers
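+
+    >>> l = ["c2_b10", "c2_b2", "c10_b1"]; human_sort(l); l
+    ['c2_b2', 'c2_b10', 'c10_b1']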
+ """
+ l.sort(key=alphanum_key)
# init general
@@ -11,22 +39,25 @@ species = config.get("species") or "homo_sapiens"
genome_assembly = config.get("genome_assembly") or "GRCh38"
fasta_dir = Path(config["fasta_dir"])
fasta_file_name = config["fasta_file_name"]
+gtf_file = fasta_dir / config['gtf_file_name']
+gene_id_file = config.get('gene_id_parquet')
+
deeprvat_parent_path = Path(config["deeprvat_repo_dir"])
annotation_python_file = (
deeprvat_parent_path / "deeprvat" / "annotations" / "annotations.py"
)
-setup_shell_path = (
- deeprvat_parent_path / "deeprvat" / "annotations" / "setup_annotation_workflow.sh"
-)
+annotation_columns_yaml_file = config.get('annotation_columns_yaml_file') or deeprvat_parent_path/'pipelines'/'config'/'annotation_colnames_filling_values.yaml'
included_chromosomes = config.get(
"included_chromosomes", [f"{c}" for c in range(1, 23)] + ["X", "Y"]
)
-variant_file = config["variant_file_path"]
-pybedtools_tmp_path = Path(config["pybedtools_tmp_path"])
+
+preprocess_dir = Path(config.get("preprocessing_workdir", ""))
+variant_file = config.get("variant_file_path") or preprocess_dir / 'norm' / 'variants' / 'variants.tsv.gz'
+genotype_file = config.get("genotype_file_path") or preprocess_dir / 'preprocessed' / 'genotypes.h5'
saved_deepripe_models_path = (
Path(config["faatpipe_repo_dir"]) / "data" / "deepripe_models"
)
-merge_nthreads = int(config.get("merge_nthreads") or 64)
+merge_nthreads = int(config.get("merge_nthreads") or 8)
# If modules are used we load them here
load_bfc = f'{config["bcftools_load_cmd"]} &&' if config.get("bcftools_load_cmd") else ""
@@ -37,16 +68,13 @@ load_vep = f'{config["vep_load_cmd"]} &&' if config.get("vep_load_cmd") else ""
# init data path
source_variant_file_pattern = config["source_variant_file_pattern"]
+source_variant_file_type = config["source_variant_file_type"]
source_variant_dir = Path(config["source_variant_dir"])
anno_tmp_dir = Path(config["anno_tmp_dir"])
anno_dir = Path(config["anno_dir"])
-metadata_dir = Path(config["metadata_dir"])
-vep_plugin_repo = Path(config["vep_plugin_repo"])
-condel_config_path = vep_plugin_repo / "config" / "Condel" / "config"
+pybedtools_tmp_path = Path(config.get("pybedtools_tmp_path", anno_tmp_dir / 'pybedtools'))
+
-# init cadd PLugin
-cadd_snv_file = config["cadd_snv_file"]
-cadd_indel_file = config["cadd_indel_file"]
# init vep
vep_source_dir = Path(config["vep_repo_dir"])
@@ -55,87 +83,239 @@ vep_plugin_dir = Path(config.get("vep_plugin_dir")) or vep_source_dir / "Plugin"
vep_input_format = config.get("vep_input_format") or "vcf"
vep_nfork = int(config.get("vep_nfork") or 5)
af_mode = config.get("af_mode") or "af"
+condel_config_path = vep_plugin_dir / "config" / "Condel" / "config"
-# init plugIns
-spliceAI_snv_file = config["spliceAI_snv_file"]
-spliceAI_indel_file = config["spliceAI_indel_file"]
-primateAIfile = config["primateAI_file"]
-pvcf_blocks_df = pd.read_csv(
- metadata_dir / config["pvcf_blocks_file"],
- sep="\t",
- header=None,
- names=["Index", "Chromosome", "Block", "First position", "Last position"],
- dtype={"Chromosome": str},
-).set_index("Index")
+#init deepSEA
+deepSEA_tmp_dir = config.get('deepSEA_tmp_dir') or anno_tmp_dir / 'deepSEA_PCA'
+deepSEA_pca_obj = config.get('deepSEA_pca_object') or anno_tmp_dir / 'deepSEA_PCA' / 'pca.npy'
+deepSEA_means_and_sds = config.get('deepSEA_means_and_sds') or anno_tmp_dir / 'deepSEA_PCA' / 'deepSEA_means_SDs.parquet'
+n_pca_components = config.get('deepsea_pca_n_components', 100)
-# init absplice
-absplice_repo_dir = Path(config["absplice_repo_dir"])
-n_cores_absplice = int(config.get("n_cores_absplice") or 4)
-ncores_merge_absplice = int(config.get("n_cores_merge_absplice") or 64)
# init deepripe
n_jobs_deepripe = int(config.get("n_jobs_deepripe") or 8)
+
# init kipoi-veff2
kipoi_repo_dir = Path(config["kipoiveff_repo_dir"])
ncores_addis = int(config.get("n_jobs_addids") or 32)
-# Filter out which chromosomes to work with
-pvcf_blocks_df = pvcf_blocks_df[
- pvcf_blocks_df["Chromosome"].isin([str(c) for c in included_chromosomes])
+
+
+# init absplice
+absplice_repo_dir = Path(config["absplice_repo_dir"])
+n_cores_absplice = int(config.get("n_cores_absplice") or 4)
+ncores_merge_absplice = int(config.get("n_cores_merge_absplice") or 8)
+ncores_agg_absplice = int(config.get("ncores_agg_absplice") or 4)
+
+source_variant_file_pattern_complete = source_variant_file_pattern+'.'+source_variant_file_type
+print(included_chromosomes)
+file_paths = [
+ glob(str(source_variant_dir / source_variant_file_pattern_complete.format(chr=c, block='*')))
+ for c in included_chromosomes
]
-chr_mapping = pd.Series(
- [str(x) for x in range(1, 23)] + ["X", "Y"], index=[str(x) for x in range(1, 25)]
-)
-inv_chr_mapping = pd.Series(
- [str(x) for x in range(1, 25)], index=[str(x) for x in range(1, 23)] + ["X", "Y"]
-)
+file_paths = list(chain.from_iterable(file_paths))
+human_sort(file_paths)
+file_stems = [Path(p).stem.split('.')[0] for p in file_paths]
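+# file_stems drops both extensions, e.g. "test_vcf_data_c21_b1.vcf.gz" -> "test_vcf_data_c21_b1"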
+
+absplice_download_dir = config.get('absplice_download_dir') or absplice_repo_dir /'example'/'data'/'resources'/'downloaded_files'
+absplice_output_dir = config.get('absplice_output_dir', anno_tmp_dir /'absplice')
+vcf_id = anno_tmp_dir / '{vcf_id}'
+vcf_dir = anno_tmp_dir
+
+config_download_path = deeprvat_parent_path/'pipelines'/'resources'/"absplice_config_download.yaml"
+with open(config_download_path, "r") as fd:
+ config_download = yaml.safe_load(fd)
+
+config_pred_path = deeprvat_parent_path / 'pipelines'/'resources'/"absplice_config_pred.yaml"
+with open(config_pred_path, "r") as fd:
+ config_pred = yaml.safe_load(fd)
+
+config_cat_path = deeprvat_parent_path / 'pipelines'/'resources'/"absplice_config_cat.yaml"
+with open(config_cat_path, "r") as fd:
+ config_cat = yaml.safe_load(fd)
-pvcf_blocks_df["chr_name"] = chr_mapping.loc[pvcf_blocks_df["Chromosome"].values].values
+absplice_main_conf_path = deeprvat_parent_path / 'pipelines'/'resources'/"config_absplice.yaml"
+with open(absplice_main_conf_path, "r") as fd:
+ absplice_main_conf = yaml.safe_load(fd)
-chromosomes = pvcf_blocks_df["chr_name"]
-block = pvcf_blocks_df["Block"]
+
+include: Path('resources')/"absplice_download.snakefile"
+include: Path('resources')/"absplice_splicing_pred_DNA.snakefile"
+if absplice_main_conf['AbSplice_RNA'] == True:
+ include: deeprvat_parent_path / 'deeprvat' / 'pipelines'/'resources'/"absplice_splicing_pred_RNA.snakefile"
+
+all_absplice_output_files = list()
+all_absplice_output_files.append(rules.all_download.input)
+all_absplice_output_files.append(rules.all_predict_dna.input)
+
+if absplice_main_conf['AbSplice_RNA'] == True:
+ all_absplice_output_files.append(rules.all_predict_rna.input)
rule all:
input:
- anno_dir / "current_annotations_absplice.parquet",
+ anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
+
+rule select_rename_fill_columns:
+ input:
+ yaml_file = annotation_columns_yaml_file,
+ annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered.parquet",
+ output:
+ anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
+ shell:
+ " ".join([
+ f"python {annotation_python_file}",
+ "select-rename-fill-annotations",
+ "{input.yaml_file}",
+ "{input.annotations_path}",
+ "{output}"
+
+
+ ])
+
+if not gene_id_file:
+ gene_id_file = anno_tmp_dir / 'protein_coding_genes.parquet'
+ rule create_gene_id_file:
+ input: gtf_file
+ output: gene_id_file
+ shell:
+ " ".join([
+ f"python {annotation_python_file}",
+ "create-protein-id-file",
+ "{input}",
+ "{output}"
+ ])
+
+rule filter_by_exon_distance:
+ input:
+ annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs.parquet",
+ gtf_file = gtf_file,
+ protein_coding_genes = gene_id_file
+ output:
+ anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered.parquet",
+ shell:
+ " ".join([
+ f"python {annotation_python_file}",
+ "filter-annotations-by-exon-distance",
+ "{input.annotations_path}",
+ "{input.gtf_file}",
+ "{input.protein_coding_genes}",
+ "{output}"
+ ])
+
+rule add_gene_ids:
+ input:
+ gene_id_file = gene_id_file,
+ annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf.parquet",
+ output: anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs.parquet",
+ shell:
+ " ".join([
+ f"python {annotation_python_file}",
+ "add-protein-ids",
+ "{input.gene_id_file}",
+ "{input.annotations_path}",
+ "{output}"
+ ])
-rule aggregate_and_merge_absplice:
+rule calculate_MAF:
input:
- abscore_files=expand(
- [
- anno_tmp_dir
- / "absplice"
- / (source_variant_file_pattern + "_AbSplice_DNA.csv")
- ],
- zip,
- chr=chromosomes,
- block=block,
- ),
- current_annotation_file=anno_dir / "vep_deepripe_deepsea.parquet",
+ anno_dir / "vep_deepripe_deepsea_absplice_af.parquet"
output:
- annotations=anno_dir / "current_annotations_absplice.parquet",
- scores=anno_tmp_dir / "abSplice_score_file.parquet",
+ anno_dir / "vep_deepripe_deepsea_absplice_maf.parquet"
shell:
+ " ".join([
+ f"python {annotation_python_file}",
+ "calculate-maf",
+ "{input}",
+ "{output}"
+
+
+ ])
+
+
+
+rule merge_allele_frequency:
+ input:
+ allele_frequencies = anno_tmp_dir / "af_df.parquet",
+ annotation_file = anno_dir / "vep_deepripe_deepsea_absplice.parquet"
+ output:
+ anno_dir / "vep_deepripe_deepsea_absplice_af.parquet"
+ shell:
+ " ".join([
+ f"python {annotation_python_file}",
+ "merge-af",
+ "{input.annotation_file}",
+ "{input.allele_frequencies}",
+ "{output}"
+
+
+ ])
+
+
+
+
+rule calculate_allele_frequency:
+ input:
+ genotype_file = genotype_file,
+ variants = variant_file
+ output:
+ allele_frequencies = anno_tmp_dir / "af_df.parquet"
+ shell:
+ " ".join([
+ f"python {annotation_python_file}",
+ "get-af-from-gt",
+ "{input.genotype_file}",
+ "{input.variants}",
+ "{output.allele_frequencies}"
+
+
+ ])
+
+
+
+
+rule merge_absplice_scores:
+ input:
+ absplice_scores = anno_tmp_dir / "abSplice_score_file.parquet",
+ current_annotation_file= anno_dir / "vep_deepripe_deepsea.parquet",
+ output:
+ anno_dir / "vep_deepripe_deepsea_absplice.parquet"
+ threads: ncores_merge_absplice
+ shell:
" ".join(
[
"python",
f"{annotation_python_file}",
- "get-abscores",
+ "merge-abscores",
"{input.current_annotation_file}",
- str(anno_tmp_dir / "absplice"),
- "{output.annotations}",
- "{output.scores}",
- f"{ncores_merge_absplice}",
- ]
- )
+ "{input.absplice_scores}",
+ "{output}",
+ ])
+
+rule aggregate_absplice_scores:
+ input:
+ abscore_files= expand([absplice_output_dir / absplice_main_conf['genome'] / 'dna' / '{file_stem}_variants_header.vcf.gz_AbSplice_DNA.csv'],file_stem = file_stems),
+ current_annotation_file= anno_dir / "vep_deepripe_deepsea.parquet",
+ output:
+ score_file = anno_tmp_dir / "abSplice_score_file.parquet",
+ threads: ncores_agg_absplice
+ shell:
+ " ".join(
+ [
+ "python",
+ f"{annotation_python_file}",
+ "aggregate-abscores {input.current_annotation_file}",
+ str(absplice_output_dir / absplice_main_conf['genome'] / 'dna' ),
+ "{output.score_file} {threads}"
+ ])
rule merge_deepsea_pcas:
input:
- annotations=anno_dir / "vep_deepripe.parquet",
- deepsea_pcas=anno_dir / "deepSea_pca" / "deepsea_pca.parquet",
+ annotations = anno_dir / "vep_deepripe.parquet",
+ deepsea_pcas = anno_dir / "all_variants.wID.deepSea.parquet",
+ col_yaml_file = annotation_columns_yaml_file
output:
anno_dir / "vep_deepripe_deepsea.parquet",
shell:
@@ -146,6 +326,7 @@ rule merge_deepsea_pcas:
"merge-deepsea-pcas",
"{input.annotations}",
"{input.deepsea_pcas}",
+ "{input.col_yaml_file}",
"{output}",
]
)
@@ -153,27 +334,21 @@ rule merge_deepsea_pcas:
rule concat_annotations:
input:
- pvcf=metadata_dir / config["pvcf_blocks_file"],
- anno_dir=anno_dir,
vcf_files=expand(
- [anno_dir / f"{source_variant_file_pattern}_merged.parquet"],
- zip,
- chr=chromosomes,
- block=block,
+ [anno_dir / "{file_stem}_merged.parquet"],
+ file_stem = file_stems,
),
output:
anno_dir / "vep_deepripe.parquet",
+ params: joined=lambda w, input: ",".join(input.vcf_files)
shell:
" ".join(
[
"python",
str(annotation_python_file),
"concat-annotations",
- "{input.pvcf}",
- "{input.anno_dir}",
- f"{str(source_variant_file_pattern + '_merged.parquet').format(chr='{{chr}}',block='{{block}}')}",
- "{output}",
- f" --included-chromosomes {','.join(included_chromosomes)}",
+ '{params.joined}',
+ "{output}"
]
)
@@ -188,6 +363,7 @@ rule merge_annotations:
deepripe_hg2=anno_dir
/ (source_variant_file_pattern + "_variants.eclip_hg2_deepripe.csv.gz"),
variant_file=variant_file,
+ vcf_file= anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"),
output:
anno_dir / f"{source_variant_file_pattern}_merged.parquet",
shell:
@@ -196,111 +372,27 @@ rule merge_annotations:
+ "{input.vep}"
+ "| head | cut -f 1 -d ':') && python "
+ f"{annotation_python_file} "
- + "merge-annotations $(($HEADER-1)) {input.vep} {input.deepripe_parclip} {input.deepripe_hg2} {input.deepripe_k5} {input.variant_file} {output}"
+ + "merge-annotations $(($HEADER-1)) {input.vep} {input.deepripe_parclip} {input.deepripe_hg2} {input.deepripe_k5} {input.variant_file} {input.vcf_file} {output}"
)
-
-rule mv_absplice_files:
- input:
- str(
- absplice_repo_dir
- / "example"
- / "data"
- / "results"
- / "hg38"
- / (source_variant_file_pattern + "_AbSplice_DNA.csv")
- ),
- output:
- anno_tmp_dir / "absplice" / (source_variant_file_pattern + "_AbSplice_DNA.csv"),
- shell:
- " ".join(
- [
- "mkdir",
- "-p",
- str(anno_tmp_dir / "absplice"),
- "&&",
- "cp",
- "{input}",
- "{output}",
- ]
- )
-
-
-rule absplice:
- conda:
- "absplice"
- input:
- vcf=expand(
- [
- absplice_repo_dir
- / "example/data/resources/analysis_files/input_files"
- / (source_variant_file_pattern + "_variants_header.vcf.gz"),
- ],
- zip,
- chr=chromosomes,
- block=block,
- ),
- config=absplice_repo_dir / "example" / "workflow" / "mv_config.done",
- output:
- expand(
- [
- str(
- absplice_repo_dir
- / "example"
- / "data"
- / "results"
- / "hg38"
- / (source_variant_file_pattern + "_AbSplice_DNA.csv")
- ),
- ],
- zip,
- chr=chromosomes,
- block=block,
- ),
- threads: n_cores_absplice
- shell:
- (
- f"""python -m snakemake -s {str(absplice_repo_dir / "example" / "workflow" / "Snakefile")} -j 1 --use-conda --rerun-incomplete --directory {str(absplice_repo_dir / "example" / "workflow")} -c"""
- + "{threads}"
- )
-
-
-rule mod_config_absplice:
- output:
- absplice_repo_dir / "example" / "workflow" / "mv_config.done",
- shell:
- f""" rm {absplice_repo_dir}/example/workflow/config.yaml && cp {deeprvat_parent_path}/pipelines/resources/absplice_config.yaml {absplice_repo_dir}/example/workflow/config.yaml && touch {absplice_repo_dir}/example/workflow/mv_config.done"""
-
-
-rule link_files_absplice:
- input:
- anno_tmp_dir / (source_variant_file_pattern + "_variants_header.vcf.gz"),
- output:
- absplice_repo_dir
- / "example/data/resources/analysis_files/input_files"
- / (source_variant_file_pattern + "_variants_header.vcf.gz"),
- shell:
- f"mkdir -p {absplice_repo_dir / 'example/data/resources/analysis_files/input_files'} && ln -s -r {{input}} {{output}}"
-
-
rule deepSea_PCA:
input:
- anno_dir / "all_variants.wID.deepSea.csv",
+ deepsea_anno = str(anno_dir / "all_variants.deepSea.parquet")
output:
- anno_dir / "deepSea_pca" / "deepsea_pca.parquet",
+ deepSEA_tmp_dir / "deepsea_pca.parquet",
shell:
" ".join(
- [
- "mkdir -p",
- str(anno_dir / "deepSea_pca"),
- "&&",
- "python",
- f"{annotation_python_file}",
- "deepsea-pca",
- f"--n-components {config['deepsea_pca_n_components']}",
- "{input}",
- str(anno_dir / "deepSea_pca" / "pca_matrix"),
- str(anno_dir / "deepSea_pca"),
+ ["mkdir -p",
+ str(deepSEA_tmp_dir),
+ "&&",
+ "python",
+ f"{annotation_python_file}",
+ "deepsea-pca",
+ "{input.deepsea_anno}",
+ f"{str(deepSEA_pca_obj)}",
+ f"{str(deepSEA_means_and_sds)}",
+ f"{deepSEA_tmp_dir}",
+ f"--n-components {n_pca_components}"
]
)
@@ -308,16 +400,16 @@ rule deepSea_PCA:
rule add_ids_deepSea:
input:
variant_file=variant_file,
- annotation_file=anno_dir / "all_variants.deepSea.csv",
+ annotation_file=deepSEA_tmp_dir / "deepsea_pca.parquet",
output:
- anno_dir / "all_variants.wID.deepSea.csv",
+ directory(anno_dir / "all_variants.wID.deepSea.parquet"),
threads: ncores_addis
shell:
" ".join(
[
"python",
f"{annotation_python_file}",
- "add-ids",
+ "add-ids-dask",
"{input.annotation_file}",
"{input.variant_file}",
"{threads}",
@@ -328,34 +420,26 @@ rule add_ids_deepSea:
rule concat_deepSea:
input:
- expand(
+ deepSEAscoreFiles = expand(
[
anno_dir
- / (source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"),
+ / ("{file_stem}" + ".CLI.deepseapredict.diff.tsv"),
],
- zip,
- chr=chromosomes,
- block=block,
+ file_stem = file_stems
),
+ params: joined=lambda w, input: ",".join(input.deepSEAscoreFiles)
+    threads: 8
output:
- anno_dir / "all_variants.deepSea.csv",
+ anno_dir / "all_variants.deepSea.parquet",
shell:
" ".join(
[
"python",
f"{annotation_python_file}",
- "concatenate-deepripe",
- "--included-chromosomes",
- ",".join(included_chromosomes),
- "--sep '\t'",
- f"{anno_dir}",
- str(
- source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"
- ).format(chr="{{chr}}", block="{{block}}"),
- str(metadata_dir / config["pvcf_blocks_file"]),
- str(
- anno_dir / "all_variants.deepSea.csv",
- ),
+ "concatenate-deepsea",
+ "{params.joined}",
+ "{output}",
+ "{threads}"
]
)
@@ -367,6 +451,7 @@ rule deepSea:
fasta=fasta_dir / fasta_file_name,
output:
anno_dir / (source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"),
+ threads: n_jobs_deepripe
conda:
"kipoi-veff2"
shell:
@@ -379,6 +464,7 @@ rule deepRiPe_parclip:
fasta=fasta_dir / fasta_file_name,
output:
anno_dir / (source_variant_file_pattern + "_variants.parclip_deepripe.csv.gz"),
+ threads: n_jobs_deepripe
shell:
f"mkdir -p {pybedtools_tmp_path / 'parclip'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'parclip'} {saved_deepripe_models_path} {{threads}} 'parclip'"
@@ -442,27 +528,27 @@ rule vep:
str(vep_nfork),
"--fasta",
"{input.fasta}",
- "--everything",
"--tab",
"--total_length",
"--no_escape",
- "--xref_refseq",
+ "--polyphen s",
+ "--sift s",
+ "--canonical",
+ "--protein",
+ "--biotype",
+ "--af",
"--force_overwrite",
"--no_stats",
"--per_gene",
- "--pick_order biotype,mane_select,mane_plus_clinical,canonical,appris,tsl,ccds,rank,length,ensembl,refseq",
- f"--plugin CADD,{cadd_snv_file},{cadd_indel_file}",
- f"--plugin SpliceAI,snv={spliceAI_snv_file},indel={spliceAI_indel_file}",
- f"--plugin PrimateAI,{primateAIfile}",
- f"--plugin Condel,{condel_config_path},s,2",
- ]
+ "--pick_order biotype,mane_select,mane_plus_clinical,canonical,appris,tsl,ccds,rank,length,ensembl,refseq"
+ ]+['--plugin '+i for i in config['additional_vep_plugin_cmds'].values()]
)
rule extract_with_header:
input:
source_variant_dir
- / (source_variant_file_pattern + f".{config['source_variant_file_type']}"),
+ / (source_variant_file_pattern + f".{source_variant_file_type}"),
output:
anno_tmp_dir / (source_variant_file_pattern + "_variants_header.vcf.gz"),
shell:
@@ -473,6 +559,7 @@ rule extract_with_header:
)
+
rule strip_chr_name:
input:
anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"),
@@ -485,7 +572,7 @@ rule strip_chr_name:
rule extract_variants:
input:
source_variant_dir
- / (source_variant_file_pattern + f".{config['source_variant_file_type']}"),
+ / (source_variant_file_pattern + f".{source_variant_file_type}"),
output:
anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"),
shell:
@@ -496,4 +583,4 @@ rule extract_variants:
"'%CHROM\t%POS\t%ID\t%REF\t%ALT\n'",
"{input} > {output}",
]
- )
+ )
\ No newline at end of file
diff --git a/pipelines/association_testing/association_dataset.snakefile b/pipelines/association_testing/association_dataset.snakefile
index 0e63e53f..ad6d1870 100644
--- a/pipelines/association_testing/association_dataset.snakefile
+++ b/pipelines/association_testing/association_dataset.snakefile
@@ -1,3 +1,7 @@
+configfile: "config.yaml"
+
+debug_flag = config.get('debug', False)
+debug = '--debug ' if debug_flag else ''
rule association_dataset:
input:
@@ -5,6 +9,10 @@ rule association_dataset:
output:
'{phenotype}/deeprvat/association_dataset.pkl'
threads: 4
+ resources:
+ mem_mb = lambda wildcards, attempt: 32000 * (attempt + 1),
+ load = 64000
+ priority: 30
shell:
'deeprvat_associate make-dataset '
+ debug +
diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile
index 550390fa..a585d422 100644
--- a/pipelines/association_testing/burdens.snakefile
+++ b/pipelines/association_testing/burdens.snakefile
@@ -1,3 +1,33 @@
+rule average_burdens:
+ input:
+ chunks = [
+ (f'{p}/deeprvat/burdens/chunk{c}.' +
+ ("finished" if p == phenotypes[0] else "linked"))
+ for p in phenotypes
+ for c in range(n_burden_chunks)
+ ] if not cv_exp else '{phenotype}/deeprvat/burdens/merging.finished'
+ output:
+ '{phenotype}/deeprvat/burdens/logs/burdens_averaging_{chunk}.finished',
+ params:
+ burdens_in = '{phenotype}/deeprvat/burdens/burdens.zarr',
+ burdens_out = '{phenotype}/deeprvat/burdens/burdens_average.zarr',
+ repeats = lambda wildcards: ''.join([f'--repeats {r} ' for r in range(int(n_repeats))])
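+        # repeats expands to "--repeats 0 --repeats 1 ...", one flag per trained repeat, so all repeats are averaged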
+ threads: 1
+ resources:
+ mem_mb = lambda wildcards, attempt: 4098 + (attempt - 1) * 4098,
+ load = 4000,
+ priority: 10,
+ shell:
+ ' && '.join([
+ ('deeprvat_associate average-burdens '
+ '--n-chunks '+ str(n_avg_chunks) + ' '
+ '--chunk {wildcards.chunk} '
+ '{params.repeats} '
+ '--agg-fct mean ' #TODO remove this
+ '{params.burdens_in} '
+ '{params.burdens_out}'),
+ 'touch {output}'
+ ])
rule link_burdens:
priority: 1
@@ -11,7 +41,12 @@ rule link_burdens:
model_config = model_path / 'config.yaml',
output:
'{phenotype}/deeprvat/burdens/chunk{chunk}.linked'
+ params:
+ prefix = '.'
threads: 8
+ resources:
+ mem_mb = lambda wildcards, attempt: 20480 + (attempt - 1) * 4098,
+ load = lambda wildcards, attempt: 16000 + (attempt - 1) * 4000
shell:
' && '.join([
('deeprvat_associate compute-burdens '
@@ -23,7 +58,7 @@ rule link_burdens:
'{input.data_config} '
'{input.model_config} '
'{input.checkpoints} '
- '{wildcards.phenotype}/deeprvat/burdens'),
+ '{params.prefix}/{wildcards.phenotype}/deeprvat/burdens'),
'touch {output}'
])
@@ -32,7 +67,7 @@ rule compute_burdens:
input:
reversed = model_path / "reverse_finished.tmp",
checkpoints = lambda wildcards: [
- model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt'
+ f'{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt'
for repeat in range(n_repeats) for bag in range(n_bags)
],
dataset = '{phenotype}/deeprvat/association_dataset.pkl',
@@ -40,7 +75,13 @@ rule compute_burdens:
model_config = model_path / 'config.yaml',
output:
'{phenotype}/deeprvat/burdens/chunk{chunk}.finished'
+ params:
+ prefix = '.'
threads: 8
+ resources:
+ mem_mb = 2000000, # Using this value will tell our modified lsf.profile not to set a memory resource
+ load = 8000,
+ gpus = 1
shell:
' && '.join([
('deeprvat_associate compute-burdens '
@@ -51,7 +92,7 @@ rule compute_burdens:
'{input.data_config} '
'{input.model_config} '
'{input.checkpoints} '
- '{wildcards.phenotype}/deeprvat/burdens'),
+ '{params.prefix}/{wildcards.phenotype}/deeprvat/burdens'),
'touch {output}'
])
@@ -62,8 +103,11 @@ rule reverse_models:
model_config = model_path / 'config.yaml',
data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml",
output:
- temp(model_path / "reverse_finished.tmp")
+ model_path / "reverse_finished.tmp"
threads: 4
+ resources:
+ mem_mb = 20480,
+ load = 20480
shell:
" && ".join([
("deeprvat_associate reverse-models "
diff --git a/pipelines/association_testing/plot.snakefile b/pipelines/association_testing/plot.snakefile
new file mode 100644
index 00000000..562a3619
--- /dev/null
+++ b/pipelines/association_testing/plot.snakefile
@@ -0,0 +1,46 @@
+# Requires that comparison_results.pkl is linked to the experiment directory
+# Requires deeprvat-analysis to be installed
+DEEPRVAT_ANALYSIS_DIR = os.environ['DEEPRVAT_ANALYSIS_DIR']
+py_deeprvat_analysis = f'python {DEEPRVAT_ANALYSIS_DIR}'
+
+rule plot:
+ conda:
+ "r-env"
+ input:
+ significant = expand("{phenotype}/deeprvat/eval/significant.parquet",
+ phenotype=phenotypes),
+ results = expand("{phenotype}/deeprvat/eval/all_results.parquet",
+ phenotype=phenotypes),
+ replication = "replication.parquet"
+ output:
+ "dicovery_replication_plot.png"
+ params:
+ results_dir = './',
+ results_dir_pattern = '',
+ code_dir = f'{DEEPRVAT_ANALYSIS_DIR}/association_testing'
+ resources:
+ mem_mb=20480,
+ load=16000,
+ script:
+ f'{DEEPRVAT_ANALYSIS_DIR}/association_testing/figure_3_main.R'
+
+rule compute_replication:
+ input:
+ results = expand("{phenotype}/deeprvat/eval/all_results.parquet",
+ phenotype = training_phenotypes)
+ output:
+ 'replication.parquet'
+ params:
+ result_files = lambda wildcards, input: ''.join([
+ f'--result-files {f} '
+ for f in input.results
+ ]),
+ n_repeats = f'{n_repeats}'
+ resources:
+ mem_mb = lambda wildcards, attempt: 32000 + attempt * 4098 * 2,
+ shell:
+ py_deeprvat_analysis + '/association_testing/compute_replication.py '
+ '--out-file {output} '
+ '--n-repeats {params.n_repeats} '
+ '{params.result_files} '
+ './ '
\ No newline at end of file
diff --git a/pipelines/association_testing/regress_eval.snakefile b/pipelines/association_testing/regress_eval.snakefile
index bcb3f369..2f5325fb 100644
--- a/pipelines/association_testing/regress_eval.snakefile
+++ b/pipelines/association_testing/regress_eval.snakefile
@@ -1,63 +1,81 @@
-
+config_file_prefix = (
+ "cv_split0/deeprvat/" if cv_exp else ""
+)
+########### Average regression
rule evaluate:
input:
- associations = expand('{{phenotype}}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
- repeat=range(n_repeats)),
- config = '{phenotype}/deeprvat/hpopt_config.yaml',
+ associations ='{phenotype}/deeprvat/average_regression_results/burden_associations.parquet',
+ config = f"{config_file_prefix}{{phenotype}}/deeprvat/hpopt_config.yaml"
output:
"{phenotype}/deeprvat/eval/significant.parquet",
"{phenotype}/deeprvat/eval/all_results.parquet"
threads: 1
+ resources:
+ mem_mb = 16000,
+ load = 16000
+ params:
+ n_combis = 1,
+ use_baseline_results = '--use-baseline-results'
shell:
'deeprvat_evaluate '
+ debug +
- '--use-seed-genes '
- '--n-repeats {n_repeats} '
- '--correction-method FDR '
+ '{params.use_baseline_results} '
+ '--correction-method Bonferroni '
+ '--phenotype {wildcards.phenotype} '
'{input.associations} '
'{input.config} '
'{wildcards.phenotype}/deeprvat/eval'
-rule all_regression:
- input:
- expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
- phenotype=phenotypes, type=['deeprvat'], repeat=range(n_repeats)),
rule combine_regression_chunks:
input:
- expand('{{phenotype}}/deeprvat/repeat_{{repeat}}/results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)),
+ expand('{{phenotype}}/deeprvat/average_regression_results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)),
output:
- '{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
+ '{phenotype}/deeprvat/average_regression_results/burden_associations.parquet',
threads: 1
+ resources:
+ mem_mb = lambda wildcards, attempt: 12000 + (attempt - 1) * 4098,
+ load = 2000
shell:
'deeprvat_associate combine-regression-results '
- '--model-name repeat_{wildcards.repeat} '
+ '--model-name repeat_0 '
'{input} '
'{output}'
+
rule regress:
input:
- config = "{phenotype}/deeprvat/hpopt_config.yaml",
- chunks = lambda wildcards: expand(
- ('{{phenotype}}/deeprvat/burdens/chunk{chunk}.' +
- ("finished" if wildcards.phenotype == phenotypes[0] else "linked")),
- chunk=range(n_burden_chunks)
- ),
+ config = f"{config_file_prefix}{{phenotype}}/deeprvat/hpopt_config.yaml",
+ chunks = lambda wildcards: (
+ [] if wildcards.phenotype == phenotypes[0]
+ else expand('{{phenotype}}/deeprvat/burdens/chunk{chunk}.linked',
+ chunk=range(n_burden_chunks))
+ ) if not cv_exp else '{phenotype}/deeprvat/burdens/merging.finished',
phenotype_0_chunks = expand(
- phenotypes[0] + '/deeprvat/burdens/chunk{chunk}.finished',
- chunk=range(n_burden_chunks)
+ phenotypes[0] + '/deeprvat/burdens/logs/burdens_averaging_{chunk}.finished',
+ chunk=range(n_avg_chunks)
),
output:
- temp('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations_{chunk}.parquet'),
+ temp('{phenotype}/deeprvat/average_regression_results/burden_associations_{chunk}.parquet'),
threads: 2
+ resources:
+ mem_mb = lambda wildcards, attempt: 28676 + (attempt - 1) * 4098,
+ # mem_mb = 16000,
+ load = lambda wildcards, attempt: 28000 + (attempt - 1) * 4000
+ params:
+ burden_file = f'{phenotypes[0]}/deeprvat/burdens/burdens_average.zarr',
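+        # burdens are computed once for phenotypes[0] (burdens_average.zarr) and reused for all other phenotypes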
+ burden_dir = '{phenotype}/deeprvat/burdens',
+ out_dir = '{phenotype}/deeprvat/average_regression_results'
shell:
'deeprvat_associate regress '
+ debug +
'--chunk {wildcards.chunk} '
'--n-chunks ' + str(n_regression_chunks) + ' '
'--use-bias '
- '--repeat {wildcards.repeat} '
+ '--repeat 0 '
+ '--burden-file {params.burden_file} '
+ do_scoretest +
'{input.config} '
- '{wildcards.phenotype}/deeprvat/burdens ' #TODO make this w/o repeats
- '{wildcards.phenotype}/deeprvat/repeat_{wildcards.repeat}/results'
\ No newline at end of file
+ '{params.burden_dir} ' #TODO make this w/o repeats
+ '{params.out_dir}'
+
diff --git a/pipelines/association_testing/regress_eval_regenie.snakefile b/pipelines/association_testing/regress_eval_regenie.snakefile
new file mode 100644
index 00000000..7cad2da4
--- /dev/null
+++ b/pipelines/association_testing/regress_eval_regenie.snakefile
@@ -0,0 +1,289 @@
+configfile: "config.yaml"
+
+debug_flag = config.get('debug', False)
+debug = '--debug ' if debug_flag else ''
+
+# n_repeats = config['n_repeats']
+
+phenotypes = config['phenotypes']
+phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes
+
+n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
+
+regenie_config_step1 = config["regenie"]["step_1"]
+regenie_config_step2 = config["regenie"]["step_2"]
+regenie_step1_bsize = regenie_config_step1["bsize"]
+regenie_step2_bsize = regenie_config_step2["bsize"]
+regenie_njobs = regenie_config_step1.get("njobs", 1)
+regenie_joblist = range(1, regenie_njobs)
+
+
+wildcard_constraints:
+ job="\d+"
+
+
+# rule evaluate:
+# input:
+# associations = expand('{{phenotype}}/deeprvat/mean_agg_results/burden_associations.parquet',
+# repeat=range(n_repeats)),
+# config = '{phenotype}/deeprvat/hpopt_config.yaml',
+# output:
+# "{phenotype}/deeprvat/eval/significant.parquet",
+# "{phenotype}/deeprvat/eval/all_results.parquet"
+# threads: 1
+# shell:
+# 'deeprvat_evaluate '
+# + debug +
+# '--use-seed-genes '
+# '--n-repeats {n_repeats} '
+# '--correction-method FDR '
+# '{input.associations} '
+# '{input.config} '
+# '{wildcards.phenotype}/deeprvat/eval'
+
+rule all_regenie:
+ input:
+ expand('{phenotype}/deeprvat/mean_agg_results/burden_associations.parquet',
+ phenotype=phenotypes),
+
+rule convert_regenie_output:
+ input:
+ expand("regenie_output/step2/deeprvat_{phenotype}.regenie",
+ phenotype=phenotypes)
+ output:
+ expand('{phenotype}/deeprvat/mean_agg_results/burden_associations.parquet',
+ phenotype=phenotypes)
+ params:
+ pheno_options = " ".join([
+ f"--phenotype {phenotype} regenie_output/step2/deeprvat_{phenotype}.regenie "
+ f"{phenotype}/deeprvat/mean_agg_results/burden_associations.parquet"
+ for phenotype in phenotypes]),
+ gene_file = config["data"]["dataset_config"]["rare_embedding"]["config"]["gene_file"]
+ threads: 1
+ resources:
+ mem_mb = 2048
+ shell:
+ "deeprvat_associate convert-regenie-output "
+ "{params.pheno_options} "
+ "{params.gene_file}"
+
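+# REGENIE step 2: association testing of the DeepRVAT pseudovariant burdens, using the
+# leave-one-chromosome-out (LOCO) predictions from step 1 as offsets.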
+rule regenie_step2:
+ input:
+ sample_file = "regenie_input/deeprvat_pseudovariants.sample",
+ bgen = "regenie_input/deeprvat_pseudovariants.bgen",
+ covariate_file = "regenie_input/covariates.txt",
+ phenotype_file = "regenie_input/phenotypes.txt",
+ step1_loco = expand("regenie_output/step1/deeprvat_{pheno_num}.loco",
+ pheno_num=range(1, len(phenotypes) + 1)),
+ step1_predlist = "regenie_output/step1/deeprvat_pred.list"
+ # step1_loco = expand("regenie_output/step1/deeprvat_l1_{pheno_number}.loco",
+ # pheno_number=range(len(phenotypes))),
+ # step1_predlist = "regenie_output/step1/deeprvat_l1_pred.list",
+ output:
+ expand("regenie_output/step2/deeprvat_{phenotype}.regenie",
+ phenotype=phenotypes)
+ threads: 16
+ resources:
+ mem_mb = 4096
+ shell:
+ "regenie "
+ "--step 2 "
+ "--bgen {input.bgen} "
+ "--ref-first "
+ "--sample {input.sample_file} "
+ "--phenoFile {input.phenotype_file} "
+ "--covarFile {input.covariate_file} "
+ "--pred {input.step1_predlist} "
+ f"--bsize {regenie_step2_bsize} "
+ "--threads 16 "
+ + " ".join(regenie_config_step2.get("options", [])) + " " +
+ "--out regenie_output/step2/deeprvat"
+
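+# REGENIE step 1: genome-wide ridge regression on the common variants in the snplist,
+# producing per-phenotype LOCO predictions and the pred.list consumed by step 2.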
+rule regenie_step1:
+ input:
+ bgen = regenie_config_step1['bgen'],
+ sample_file = "regenie_input/deeprvat_pseudovariants.sample",
+ snplist = regenie_config_step1["snplist"],
+ covariate_file = "regenie_input/covariates.txt",
+ phenotype_file = "regenie_input/phenotypes.txt",
+ output:
+ expand("regenie_output/step1/deeprvat_{pheno_num}.loco",
+ pheno_num=range(1, len(phenotypes) + 1)),
+ "regenie_output/step1/deeprvat_pred.list"
+ threads: 24
+ resources:
+ mem_mb = 16000
+ shell:
+ "mkdir -p regenie_step1_tmp && "
+ "regenie "
+ "--step 1 "
+ "--bgen {input.bgen} "
+ "--extract {input.snplist} "
+ "--keep {input.sample_file} "
+ "--phenoFile {input.phenotype_file} "
+ "--covarFile {input.covariate_file} "
+ f"--bsize {regenie_step1_bsize} "
+ "--threads 24 "
+ "--lowmem "
+ "--lowmem-prefix regenie_step1_tmp/deeprvat "
+ + " ".join(regenie_config_step1.get("options", [])) + " " +
+ "--out regenie_output/step1/deeprvat ; "
+ "rm -rf regenie_step1_tmp"
+
+
+# rule regenie_step1_runl1:
+# input:
+# expand("regenie_output/step1/deeprvat_parallel_job{job}_l0_Y{pheno_number}",
+# job=regenie_joblist, pheno_number=range(1, len(phenotypes) + 1)),
+# bgen = regenie_config_step1['bgen'],
+# sample_file = "regenie_input/deeprvat_pseudovariants.sample",
+# snplist = regenie_config_step1["snplist"],
+# covariate_file = "regenie_input/covariates.txt",
+# phenotype_file = "regenie_input/phenotypes.txt",
+# output:
+# expand("regenie_output/step1/deeprvat_l1_{pheno_number}.loco",
+# pheno_number=range(len(phenotypes))),
+# "regenie_output/step1/deeprvat_l1_pred.list"
+# threads: 16
+# resources:
+# mem_mb = 16000
+# shell:
+# "regenie "
+# "--step 1 "
+# "--bgen {input.bgen} "
+# "--extract {input.snplist} "
+# "--keep {input.sample_file} "
+# "--phenoFile {input.phenotype_file} "
+# "--covarFile {input.covariate_file} "
+# f"--bsize {regenie_step1_bsize} "
+# "--lowmem "
+# "--lowmem-prefix regenie_step1_tmp "
+# "--threads 16 "
+# + " ".join(regenie_config_step1.get("options", [])) + " " +
+# "--out regenie_output/step1/deeprvat_l1 "
+# f"--run-l1 regenie_output/step1/deeprvat_parallel.master"
+
+# rule regenie_step1_runl0:
+# input:
+# master = "regenie_output/step1/deeprvat_parallel.master",
+# snplists = expand("regenie_output/step1/deeprvat_parallel_job{job}.snplist",
+# job=regenie_joblist),
+# bgen = regenie_config_step1['bgen'],
+# sample_file = "regenie_input/deeprvat_pseudovariants.sample",
+# covariate_file = "regenie_input/covariates.txt",
+# phenotype_file = "regenie_input/phenotypes.txt",
+# output:
+# expand("regenie_output/step1/deeprvat_parallel_job{{job}}_l0_Y{pheno_number}",
+# pheno_number=range(1, len(phenotypes) + 1))
+# threads: 8
+# resources:
+# mem_mb = 16000
+# shell:
+# " mkdir -p regenie_step1_tmp_job{wildcards.job} && "
+# "regenie "
+# "--step 1 "
+# "--bgen {input.bgen} "
+# "--keep {input.sample_file} "
+# "--phenoFile regenie_input/phenotypes.txt "
+# "--covarFile regenie_input/covariates.txt "
+# f"--bsize {regenie_step1_bsize} "
+# "--lowmem "
+# "--lowmem-prefix regenie_step1_tmp_job{wildcards.job} "
+# "--threads 8 "
+# + " ".join(regenie_config_step1.get("options", [])) + " " +
+# "--out regenie_output/step1/deeprvat "
+# "--run-l0 regenie_output/step1/deeprvat_parallel.master,{wildcards.job} && "
+# "rm -rf regenie_step1_tmp_job{wildcards.job}"
+
+# rule regenie_step1_splitl0:
+# input:
+# bgen = regenie_config_step1['bgen'],
+# sample_file = "regenie_input/deeprvat_pseudovariants.sample",
+# snplist = regenie_config_step1["snplist"],
+# covariate_file = "regenie_input/covariates.txt",
+# phenotype_file = "regenie_input/phenotypes.txt",
+# output:
+# "regenie_output/step1/deeprvat_parallel.master",
+# expand("regenie_output/step1/deeprvat_parallel_job{job}.snplist",
+# job=regenie_joblist)
+# threads: 8
+# resources:
+# mem_mb = 16000
+# shell:
+# "regenie "
+# "--step 1 "
+# "--bgen {input.bgen} "
+# "--extract {input.snplist} "
+# "--keep {input.sample_file} "
+# "--phenoFile {input.phenotype_file} "
+# "--covarFile {input.covariate_file} "
+# f"--bsize {regenie_step1_bsize} "
+# "--threads 8 "
+# + " ".join(regenie_config_step1.get("options", [])) + " " +
+# "--out regenie_output/step1/deeprvat "
+# f"--split-l0 regenie_output/step1/deeprvat_parallel,{regenie_njobs}"
+
+rule make_regenie_burdens:
+ input:
+ gene_file = config["data"]["dataset_config"]["rare_embedding"]["config"]["gene_file"],
+ gtf_file = config["gtf_file"],
+ burdens = [f'{phenotype}/deeprvat/burdens/chunk{chunk}.' +
+ ("finished" if phenotype == phenotypes[0] else "linked")
+ for phenotype in phenotypes
+ for chunk in range(n_burden_chunks)],
+ datasets = expand("{phenotype}/deeprvat/association_dataset.pkl",
+ phenotype=phenotypes),
+ params:
+ phenotypes = " ".join([f"--phenotype {p} {p}/deeprvat/association_dataset.pkl {p}/deeprvat/burdens"
+ for p in phenotypes]) + " "
+ output:
+ bgen = "regenie_input/deeprvat_pseudovariants.bgen",
+ threads: 8
+ resources:
+ mem_mb = 64000
+ shell:
+ "deeprvat_associate make-regenie-input "
+ + debug +
+ "--skip-samples "
+ "--skip-covariates "
+ "--skip-phenotypes "
+ "--average-repeats "
+ "{params.phenotypes}"
+ # "{input.dataset} "
+ # "{wildcards.phenotype}/deeprvat/burdens "
+ "--bgen {output.bgen} "
+ "{input.gene_file} "
+ "{input.gtf_file} "
+
+rule make_regenie_metadata:
+ input:
+ gene_file = config["data"]["dataset_config"]["rare_embedding"]["config"]["gene_file"],
+ gtf_file = config["gtf_file"],
+ burdens = [f'{phenotype}/deeprvat/burdens/chunk{chunk}.' +
+ ("finished" if phenotype == phenotypes[0] else "linked")
+ for phenotype in phenotypes
+ for chunk in range(n_burden_chunks)],
+ datasets = expand("{phenotype}/deeprvat/association_dataset.pkl",
+ phenotype=phenotypes),
+ params:
+ phenotypes = " ".join([f"--phenotype {p} {p}/deeprvat/association_dataset.pkl {p}/deeprvat/burdens"
+ for p in phenotypes]) + " "
+ output:
+ sample_file = "regenie_input/deeprvat_pseudovariants.sample",
+ covariate_file = "regenie_input/covariates.txt",
+ phenotype_file = "regenie_input/phenotypes.txt",
+ threads: 1
+ resources:
+ mem_mb = 16000
+ shell:
+ "deeprvat_associate make-regenie-input "
+ + debug +
+ "--skip-burdens "
+ "{params.phenotypes}"
+ # "{input.dataset} "
+ # "{wildcards.phenotype}/deeprvat/burdens "
+ "--sample-file {output.sample_file} "
+ "--covariate-file {output.covariate_file} "
+ "--phenotype-file {output.phenotype_file} "
+ "{input.gene_file} "
+ "{input.gtf_file} "
diff --git a/pipelines/association_testing_control_for_common_variants.snakefile b/pipelines/association_testing_control_for_common_variants.snakefile
new file mode 100644
index 00000000..37729478
--- /dev/null
+++ b/pipelines/association_testing_control_for_common_variants.snakefile
@@ -0,0 +1,222 @@
+import os
+from pathlib import Path
+
+configfile: 'config.yaml'
+
+debug_flag = config.get('debug', False)
+phenotypes = config['phenotypes']
+phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes
+training_phenotypes = config["training"].get("phenotypes", phenotypes)
+
+
+n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
+
+n_regression_chunks = 1
+n_bags = config['training']['n_bags'] if not debug_flag else 3
+n_repeats = config['n_repeats']
+debug = '--debug ' if debug_flag else ''
+do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else ''
+
+
+DEEPRVAT_ANALYSIS_DIR=os.environ['DEEPRVAT_ANALYSIS_DIR']
+DEEPRVAT_DIR=os.environ['DEEPRVAT_DIR']
+
+py_deeprvat_analysis= f'python {DEEPRVAT_ANALYSIS_DIR}'
+py_deeprvat = f'python {DEEPRVAT_DIR}/deeprvat/deeprvat'
+
+wildcard_constraints:
+ repeat="\d+",
+ trial="\d+",
+
+cv_exp = True if os.path.exists('cv_split0/') else False
+config_file_prefix = 'cv_split0/deeprvat/' if cv_exp else '' #needed in case we analyse a CV experiment
+print(config_file_prefix)
+cv_splits = 5 #TODO make this more robust for non-cv experiment
+
+
+burden_agg_fct = 'mean'
+n_avg_repeats = 6
+use_seed = 'wo_seed'
+combi = 0
+
+
+phenotypes = training_phenotypes
+
+
+## specific for common variants
+phecode_dict = {'Apolipoprotein_A': 30630,
+ 'Apolipoprotein_B': 30640,
+ 'Calcium': 30680,
+ 'Cholesterol': 30690,
+ 'HDL_cholesterol': 30760,
+ 'IGF_1': 30770,
+ 'LDL_direct': 30780,
+ 'Lymphocyte_percentage': 30180,
+ 'Mean_corpuscular_volume': 30040,
+ 'Mean_platelet_thrombocyte_volume_(MPTV)': 30100,
+ 'Mean_reticulocyte_volume': 30260,
+ 'Neutrophill_count': 30140,
+ 'Platelet_count': 30080,
+ 'Platelet_crit': 30090,
+ 'Platelet_distribution_width': 30110,
+ 'Red_blood_cell_(erythrocyte)_count': 30010,
+ 'SHBG': 30830,
+ 'Standing_height': 50,
+ 'Total_bilirubin': 30840,
+ 'Triglycerides': 30870,
+ 'Urate': 30880,
+ 'Body_mass_index_BMI': 21001,
+ 'Glucose': 30740,
+ 'Vitamin_D': 30890,
+ 'Albumin': 30600,
+ 'Total_protein': 30860,
+ 'Cystatin_C': 30720,
+ 'Gamma_glutamyltransferase': 30730,
+ 'Alkaline_phosphatase': 30610,
+ 'Creatinine': 30700,
+ 'Whole_body_fat_free_mass': 23101,
+ 'Forced_expiratory_volume_in_1_second_FEV1': 20153,
+ 'Glycated_haemoglobin_HbA1c': 30750,
+ 'Mean_platelet_thrombocyte_volume': 30100,
+ 'Red_blood_cell_erythrocyte_count': 30010}
+
+
+
+gtf_file = 'gencode.v34lift37.annotation.gtf.gz'
+genotype_base_dir = 'genotypes/'
+padding = 500
+
+burden_phenotype = phenotypes[0]
+
+print('missing phenotypes')
+print(set(phenotypes) - set(phecode_dict.keys()))
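+# keep only phenotypes that have a UK Biobank field code in phecode_dict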
+phenotypes = set(phenotypes).intersection(set(phecode_dict.keys()))
+print(f'number of kept phenotypes: {len(phenotypes)}')
+print(phenotypes)
+
+
+
+rule all_regression_correct_common:
+ input:
+ expand(f'{{phenotype}}/deeprvat/{burden_agg_fct}_agg_results/{n_avg_repeats}_repeats/combi_{combi}/burden_associations_common_variant_corrected.parquet',
+ phenotype = phenotypes
+ )
+
+
+
+rule regression_correct_common:
+ input:
+ config = f"{config_file_prefix}{{phenotype}}/deeprvat/hpopt_config.yaml",
+ chunks = lambda wildcards: (
+ [] if wildcards.phenotype == phenotypes[0]
+ else expand('{{phenotype}}/deeprvat/burdens/chunk{chunk}.linked',
+ chunk=range(n_burden_chunks))
+ ) if not cv_exp else '{phenotype}/deeprvat/burdens/merging.finished',
+ genes_to_keep = '{phenotype}/deeprvat/burdens/significant_genes_restest.npy',
+ common_variants = '{phenotype}/deeprvat/burdens/prepare_genotypes_per_gene.finished'
+ output:
+ '{phenotype}/deeprvat/{burden_agg_fct}_agg_results/{n_avg_repeats}_repeats/combi_{combi}/burden_associations_common_variant_corrected.parquet',
+ threads: 2
+ resources:
+ mem_mb = lambda wildcards, attempt: 28676 + (attempt - 1) * 4098,
+ load = lambda wildcards, attempt: 28000 + (attempt - 1) * 4000
+ params:
+ burden_file = f'{burden_phenotype}/deeprvat/burdens/burdens_{{burden_agg_fct}}_{{n_avg_repeats}}_{{combi}}.zarr',
+ burden_dir = '{phenotype}/deeprvat/burdens/',
+ out_dir = '{phenotype}/deeprvat/{burden_agg_fct}_agg_results/{n_avg_repeats}_repeats/combi_{combi}',
+ common_genotype_prefix = '{phenotype}/deeprvat/burdens/genotypes_gene'
+ shell:
+ 'deeprvat_associate regress-common '
+ + debug +
+ '--chunk 0 '
+ '--n-chunks 1 '
+ '--use-bias '
+ '--repeat 0 '
+ '--burden-file {params.burden_file} '
+ '--common-genotype-prefix {params.common_genotype_prefix} '
+ '--genes-to-keep {input.genes_to_keep} '
+ + do_scoretest +
+ '{input.config} '
+ '{params.burden_dir} ' #TODO make this w/o repeats
+ '{output}'
+
+rule all_data:
+ input:
+ expand('{phenotype}/deeprvat/burdens/prepare_genotypes_per_gene.finished',
+ phenotype = phenotypes
+ )
+
+rule prepare_genotypes_per_gene:
+ conda:
+ "prs" #TODO upgrade deeprvat environment pyarrow to version 6.0.1. to make DeepRVAT env work
+ input:
+ significant_genes = '{phenotype}/deeprvat/eval/significant_genes_restest.parquet',
+ config = 'config.yaml', #TODO potentially make this phenotype specific,
+ genotype_file = lambda wildcards: f'{genotype_base_dir}/GWAS_variants_clumped_mac_{phecode_dict[wildcards.phenotype]}.parquet',
+ sample_file = '{phenotype}/deeprvat/burdens/sample_ids.finished'
+ params:
+ out_dir = '{phenotype}/deeprvat/burdens/',
+ sample_file = '{phenotype}/deeprvat/burdens/sample_ids.zarr'
+ output:
+ '{phenotype}/deeprvat/burdens/prepare_genotypes_per_gene.finished'
+ threads: 16
+ resources:
+ mem_mb = 60000
+ # mem_mb = lambda wildcards, attempt: 60000 + (attempt - 1) * 4098,
+ shell:
+ ' && '.join([
+ (f'{py_deeprvat}/common_variant_condition_utils.py prepare-genotypes-per-gene '
+ '--gtf-file '+ str(gtf_file) + ' '
+ '--padding '+ str(padding) + ' '
+ '--standardize '
+ '{input.config} '
+ '{input.significant_genes} '
+ '{input.genotype_file} '
+ '{params.sample_file} '
+ '{params.out_dir} '),
+ 'touch {output}'
+ ])
+
+
+rule get_significant_genes:
+ input:
+ res_file = f"{{phenotype}}/deeprvat/{burden_agg_fct}_agg_results/{n_avg_repeats}_repeats/eval/{use_seed}/all_results.parquet",
+ config = 'config.yaml' #TODO potentially make this phenotype specific
+ output:
+ out_parquet = '{phenotype}/deeprvat/eval/significant_genes_restest.parquet',
+ out_npy = '{phenotype}/deeprvat/burdens/significant_genes_restest.npy'
+ threads: 2
+ resources:
+ mem_mb = lambda wildcards, attempt: 8000 + (attempt - 1) * 4098,
+ shell:
+ py_deeprvat + '/common_variant_condition_utils.py get-significant-genes '
+ '--pval-correction-method Bonferroni '
+ # f'{debug_flag} '
+ '{input.config} '
+ '{input.res_file} '
+ '{output.out_parquet} '
+ '{output.out_npy} '
+
+# This will be redundant in the future, since it is included in newer versions of associate.py
+
+rule get_ordered_sample_ids:
+ input:
+ dataset_pickle = expand('cv_split{split}/deeprvat/{{phenotype}}/deeprvat/association_dataset.pkl', split = range(cv_splits))
+ output:
+ '{phenotype}/deeprvat/burdens/sample_ids.finished'
+ params:
+ dataset_files = lambda wildcards, input: ''.join([
+ f'--dataset-files {f} '
+ for f in input.dataset_pickle
+ ]),
+ out_file = '{phenotype}/deeprvat/burdens/sample_ids.zarr'
+ threads: 8
+ resources:
+ mem_mb = lambda wildcards, attempt: 32000 + (attempt - 1) * 4098,
+ shell:
+ ' && '.join([
+ (py_deeprvat + '/get_ordered_sample_ids.py get-ordered-sample-ids '
+ '{params.dataset_files} '
+ '{params.out_file} '),
+ 'touch {output}'
+ ])
+
diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile
index d7aaa006..7895d27b 100644
--- a/pipelines/association_testing_pretrained.snakefile
+++ b/pipelines/association_testing_pretrained.snakefile
@@ -9,6 +9,7 @@ training_phenotypes = config["training"].get("phenotypes", phenotypes)
n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
+n_avg_chunks = config.get('n_avg_chunks', 40)
n_trials = config['hyperparameter_optimization']['n_trials']
n_bags = config['training']['n_bags'] if not debug_flag else 3
n_repeats = config['n_repeats']
@@ -21,37 +22,49 @@ wildcard_constraints:
repeat="\d+",
trial="\d+",
+cv_exp = False
+config_file_prefix = (
+ "cv_split0/deeprvat/" if cv_exp else ""
+)
+
+
include: "training/config.snakefile"
include: "association_testing/association_dataset.snakefile"
include: "association_testing/burdens.snakefile"
include: "association_testing/regress_eval.snakefile"
-rule all:
+
+
+
+rule all_evaluate: #plot.snakefile
input:
- expand("{phenotype}/deeprvat/eval/significant.parquet",
- phenotype=phenotypes),
- expand("{phenotype}/deeprvat/eval/all_results.parquet",
- phenotype=phenotypes)
+ significant=expand(
+ "{phenotype}/deeprvat/eval/significant.parquet", phenotype=phenotypes
+ ),
+ results=expand(
+ "{phenotype}/deeprvat/eval/all_results.parquet", phenotype=phenotypes
+ ),
+
-rule all_burdens:
+rule all_regression: #regress_eval.snakefile
input:
- [
- (f'{p}/deeprvat/burdens/chunk{c}.' +
- ("finished" if p == phenotypes[0] else "linked"))
- for p in phenotypes
- for c in range(n_burden_chunks)
- ]
-
-rule all_association_dataset:
+ expand(
+ "{phenotype}/deeprvat/average_regression_results/burden_associations.parquet",
+ phenotype=phenotypes,
+ ),
+
+
+rule all_average_burdens: #burdens.snakefile
input:
- expand('{phenotype}/deeprvat/association_dataset.pkl',
- phenotype=phenotypes)
+ expand(
+ "{phenotype}/deeprvat/burdens/logs/burdens_averaging_{chunk}.finished",
+ chunk=range(n_avg_chunks),
+ phenotype=phenotypes[0],
+ ),
-rule all_config:
+rule all_config: #cv_training.snakefile
input:
- seed_genes = expand('{phenotype}/deeprvat/seed_genes.parquet',
- phenotype=phenotypes),
- config = expand('{phenotype}/deeprvat/hpopt_config.yaml',
- phenotype=phenotypes),
- baseline = expand('{phenotype}/deeprvat/baseline_results.parquet',
- phenotype=phenotypes),
+ expand(
+ "{phenotype}/deeprvat/hpopt_config.yaml",
+ phenotype=phenotypes,
+ ),
diff --git a/pipelines/association_testing_pretrained_regenie.snakefile b/pipelines/association_testing_pretrained_regenie.snakefile
new file mode 100644
index 00000000..f3eb0b0e
--- /dev/null
+++ b/pipelines/association_testing_pretrained_regenie.snakefile
@@ -0,0 +1,45 @@
+from pathlib import Path
+
+configfile: 'config.yaml'
+
+debug_flag = config.get('debug', False)
+phenotypes = config['phenotypes']
+phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes
+
+n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
+n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
+n_bags = config['training']['n_bags'] if not debug_flag else 3
+n_repeats = config['n_repeats']
+debug = '--debug ' if debug_flag else ''
+do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else ''
+model_path = Path(config.get("pretrained_model_path", "pretrained_models"))
+
+wildcard_constraints:
+ repeat="\d+",
+ trial="\d+",
+
+include: "association_testing/config.snakefile"
+include: "association_testing/association_dataset.snakefile"
+include: "association_testing/burdens.snakefile"
+include: "association_testing/regress_eval_regenie.snakefile"
+
+rule all:
+ input:
+ expand("{phenotype}/deeprvat/eval/significant.parquet",
+ phenotype=phenotypes),
+ expand("{phenotype}/deeprvat/eval/all_results.parquet",
+ phenotype=phenotypes)
+
+rule all_burdens:
+ input:
+ [
+ (f'{p}/deeprvat/burdens/chunk{c}.' +
+ ("finished" if p == phenotypes[0] else "linked"))
+ for p in phenotypes
+ for c in range(n_burden_chunks)
+ ]
+
+rule all_association_dataset:
+ input:
+ expand('{phenotype}/deeprvat/association_dataset.pkl',
+ phenotype=phenotypes)
diff --git a/pipelines/config/annotation_colnames_filling_values.yaml b/pipelines/config/annotation_colnames_filling_values.yaml
new file mode 100644
index 00000000..9977be93
--- /dev/null
+++ b/pipelines/config/annotation_colnames_filling_values.yaml
@@ -0,0 +1,73 @@
+annotation_column_names:
+ 'CADD_RAW' :
+ 'CADD_raw': 0
+ 'PrimateAI' :
+ 'PrimateAI_score' : 0
+ 'SpliceAI_delta_score' :
+ 'SpliceAI_delta_score' : 0
+ 'am_pathogenicity' :
+ 'alphamissense' : 0
+ 'af' :
+ 'combined_UKB_NFE_AF' : 0
+ 'maf_mb' :
+ 'combined_UKB_NFE_AF_MB' : 10000
+ 'maf' :
+ 'combined_UKB_NFE_MAF' : 0
+ 'Condel' :
+ 'condel_score' : 0
+ 'PolyPhen' :
+ 'polyphen_score' : 0
+ 'SIFT' :
+ 'sift_score' : 1
+ 'QKI_hg2' :
+ 'DeepRipe_plus_QKI_lip_hg2' : 0
+ 'QKI_k5' :
+ 'DeepRipe_plus_QKI_clip_k5' : 0
+ 'KHDRBS1_k5' :
+ 'DeepRipe_plus_KHDRBS1_clip_k5' : 0
+ 'ELAVL1_parclip' :
+ 'DeepRipe_plus_ELAVL1_parclip' : 0
+ 'TARDBP_parclip' :
+ 'DeepRipe_plus_TARDBP_parclip' : 0
+ 'HNRNPD_parclip' :
+ 'DeepRipe_plus_HNRNPD_parclip' : 0
+ 'MBNL1_parclip' :
+ 'DeepRipe_plus_MBNL1_parclip' : 0
+ 'QKI_parclip' :
+ 'DeepRipe_plus_QKI_parclip' : 0
+ 'Consequence_splice_acceptor_variant' :
+ 'Consequence_splice_acceptor_variant' : 0
+ 'Consequence_splice_donor_variant' :
+ 'Consequence_splice_donor_variant' : 0
+ 'Consequence_stop_gained' :
+ 'Consequence_stop_gained' : 0
+ 'Consequence_frameshift_variant' :
+ 'Consequence_frameshift_variant' : 0
+ 'Consequence_stop_lost' :
+ 'Consequence_stop_lost' : 0
+ 'Consequence_start_lost' :
+ 'Consequence_start_lost' : 0
+ 'Consequence_inframe_insertion' :
+ 'Consequence_inframe_insertion' : 0
+ 'Consequence_inframe_deletion' :
+ 'Consequence_inframe_deletion' : 0
+ 'Consequence_missense_variant' :
+ 'Consequence_missense_variant' : 0
+ 'Consequence_protein_altering_variant' :
+ 'Consequence_protein_altering_variant' : 0
+ 'Consequence_splice_region_variant' :
+ 'Consequence_splice_region_variant' : 0
+ 'DeepSEA_PC_1' :
+ 'DeepSEA_PC_1' : 0
+ 'DeepSEA_PC_2' :
+ 'DeepSEA_PC_2' : 0
+ 'DeepSEA_PC_3' :
+ 'DeepSEA_PC_3' : 0
+ 'DeepSEA_PC_4' :
+ 'DeepSEA_PC_4' : 0
+ 'DeepSEA_PC_5' :
+ 'DeepSEA_PC_5' : 0
+ 'DeepSEA_PC_6' :
+ 'DeepSEA_PC_6' : 0
+ 'AF' :
+ 'AF' : 0
diff --git a/pipelines/config/deeprvat_annotation_config.yaml b/pipelines/config/deeprvat_annotation_config.yaml
index 79cc7dfa..cc9f0b79 100644
--- a/pipelines/config/deeprvat_annotation_config.yaml
+++ b/pipelines/config/deeprvat_annotation_config.yaml
@@ -1,10 +1,11 @@
-fasta_dir : reference
-fasta_file_name : hg38.fa
# Uncomment to use module load for required tools
#bcftools_load_cmd : module load bcftools/1.9
#htslib_load_cmd : module load htslib/1.9
#perl_load_cmd : module load perl/5.20.2
#vep_load_cmd : module load vep/108.1
+fasta_dir : reference
+fasta_file_name : hg38.fa
+gtf_file_name : gencode.v44.annotation.gtf.gz
source_variant_file_pattern : test_vcf_data_c{chr}_b{block}
source_variant_file_type: 'vcf.gz'
@@ -12,29 +13,21 @@ source_variant_file_type: 'vcf.gz'
# comment out / remove to run on all chromosomes
included_chromosomes : ['21','22']
-metadata_dir : input_dir/vcf/metadata
-pvcf_blocks_file : pvcf_blocks.txt
source_variant_dir : input_dir/vcf
anno_tmp_dir : output_dir/annotations/tmp
anno_dir : output_dir/annotations
-vep_cache_dir : repo_dir/ensembl-vep/cache/vep109
+vep_cache_dir : repo_dir/ensembl-vep/cache/
vep_plugin_dir : repo_dir/ensembl-vep/Plugins
-spliceAI_snv_file : annotation_data/spliceAI/spliceai_scores.raw.snv.hg38.vcf.gz
-spliceAI_indel_file : annotation_data/spliceAI/spliceai_scores.raw.indel.hg38.vcf.gz
-primateAI_file : annotation_data/primateAI/PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz
-cadd_snv_file : annotation_data/cadd/whole_genome_SNVs.tsv.gz
-cadd_indel_file : annotation_data/cadd/gnomad.genomes.r3.0.indel.tsv.gz
absplice_repo_dir : repo_dir/absplice
-deeprvat_repo_dir : deeprvat_repo_dir
+deeprvat_repo_dir : ../..
kipoiveff_repo_dir : repo_dir/kipoi-veff2
faatpipe_repo_dir : repo_dir/faatpipe
vep_repo_dir : repo_dir/ensembl-vep
-vep_plugin_repo : repo_dir/VEP_plugins
-variant_file_path : preprocessing_workdir/norm/variants/variants.tsv.gz
-pybedtools_tmp_path : output_dir/annotations/tmp/pybedtools
-n_jobs_deepripe : 32
-n_cores_merge_absplice : 32
-n_cores_absplice : 32
-deepsea_pca_pickle_filepath : annotations/deepSea_pca/pca.pkl
-deepsea_pca_n_components: 100
+preprocessing_workdir : preprocessing_workdir
+additional_vep_plugin_cmds:
+ cadd : CADD,annotation_data/cadd/whole_genome_SNVs.tsv.gz,annotation_data/cadd/gnomad.genomes.r3.0.indel.tsv.gz
+ spliceAI : SpliceAI,snv=annotation_data/spliceAI/spliceai_scores.raw.snv.hg38.vcf.gz,indel=annotation_data/spliceAI/spliceai_scores.raw.indel.hg38.vcf.gz
+ primateAI : PrimateAI,annotation_data/primateAI/PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz
+ condel: Condel,repo_dir/ensembl-vep/Plugin/config/Condel/config,s,2
+ alphamissense : AlphaMissense,file=annotation_data/AlphaMissense/AlphaMissense_hg38.tsv.gz
\ No newline at end of file
diff --git a/pipelines/cv_training/cv_burdens.snakefile b/pipelines/cv_training/cv_burdens.snakefile
new file mode 100644
index 00000000..f618e136
--- /dev/null
+++ b/pipelines/cv_training/cv_burdens.snakefile
@@ -0,0 +1,106 @@
+rule all_cv_burdens:
+ input:
+ expand("{phenotype}/deeprvat/burdens/merging.finished", phenotype=phenotypes),
+
+
+# # ############################### Run DeepRVAT ##############################################################
+# # ###########################################################################################################
+module deeprvat_associate:
+ snakefile:
+ "../training_association_testing.snakefile"
+ # f"{DEEPRVAT_DIR}/pipelines/training_association_testing.snakefile"
+    # With the version below, the module doesn't have the local namespace;
+    # an alternative is to put the 'header'/variable definitions into all snakefiles
+ # "../association_testing/association_dataset.snakefile"
+ prefix:
+ "cv_split{cv_split}/deeprvat"
+ config:
+ config
+
+
+# # ############################### Computation of test set deeprvat burdens ##############################################################
+
+
+rule make_deeprvat_test_config:
+ input:
+ config_train="cv_split{cv_split}/deeprvat/{phenotype}/deeprvat/hpopt_config.yaml",
+ output:
+ config_test="cv_split{cv_split}/deeprvat/{phenotype}/deeprvat/hpopt_config_test.yaml",
+ shell:
+ " && ".join(
+ [
+ conda_check,
+ "deeprvat_cv_utils generate-test-config "
+ "--fold {wildcards.cv_split} "
+ f"--n-folds {cv_splits}"
+ " {input.config_train} {output.config_test}",
+ ]
+ )
+
+
+# Generate the association data set from the test samples (as defined in the config),
+# pass the sample file here, and then use this data set normally for burden computation.
+use rule association_dataset from deeprvat_associate as deeprvat_association_dataset with:
+ input:
+ config="cv_split{cv_split}/deeprvat/{phenotype}/deeprvat/hpopt_config_test.yaml",
+ output:
+ "cv_split{cv_split}/deeprvat/{phenotype}/deeprvat/association_dataset.pkl",
+ threads: 4
+
+
+suffix_dict = {p: "linked" if p != burden_phenotype else "finished" for p in phenotypes}
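+# Only the burden phenotype's chunks are actually computed ("finished"); all other
+# phenotypes link to them ("linked").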
+
+
+rule combine_test_burdens:
+ input:
+ burdens=lambda wildcards: [
+ (
+ f"cv_split{cv_split}/deeprvat/{wildcards.phenotype}/deeprvat/burdens/chunk{c}.{suffix_dict[wildcards.phenotype]}"
+ )
+ for c in range(n_burden_chunks)
+ for cv_split in range(cv_splits)
+ ],
+ config="config.yaml",
+ output:
+ "{phenotype}/deeprvat/burdens/merging.finished",
+ params:
+ out_dir="{phenotype}/deeprvat/burdens",
+ burden_paths=lambda wildcards, input: "".join(
+ [
+ f"--burden-dirs cv_split{fold}/deeprvat/{wildcards.phenotype}/deeprvat/burdens "
+ for fold in range(cv_splits)
+ ]
+ ),
+ link=lambda wildcards: (
+ f"--link-burdens ../../../{burden_phenotype}/deeprvat/burdens/burdens.zarr"
+ if wildcards.phenotype != burden_phenotype
+ else " "
+ ),
+ resources:
+ mem_mb=lambda wildcards, attempt: 32000 + attempt * 4098 * 2,
+ shell:
+ " && ".join(
+ [
+ conda_check,
+ "deeprvat_cv_utils combine-test-set-burdens "
+ "{params.link} "
+ "{params.burden_paths} "
+ "{params.out_dir} "
+ "{input.config}",
+ "touch {output}",
+ ]
+ )
+
+
+use rule link_burdens from deeprvat_workflow as deeprvat_link_burdens with:
+ params:
+ prefix="cv_split{cv_split}/deeprvat",
+
+
+use rule compute_burdens from deeprvat_workflow as deeprvat_compute_burdens with:
+ params:
+ prefix="cv_split{cv_split}/deeprvat",
+
+
+use rule reverse_models from deeprvat_workflow as deeprvat_reverse_models
diff --git a/pipelines/cv_training/cv_training.snakefile b/pipelines/cv_training/cv_training.snakefile
new file mode 100644
index 00000000..be8bfaed
--- /dev/null
+++ b/pipelines/cv_training/cv_training.snakefile
@@ -0,0 +1,91 @@
+rule all_cv_training:
+ input:
+ expand('cv_split{cv_split}/deeprvat/models/repeat_{repeat}/best/bag_{bag}.ckpt',
+ bag=range(n_bags), repeat=range(n_repeats),
+ cv_split = range(cv_splits)),
+ expand('cv_split{cv_split}/deeprvat/models/repeat_{repeat}/config.yaml',
+ repeat=range(n_repeats),
+ cv_split = range(cv_splits))
+
+# make a config for each cv_split (specifying the samples for the current fold)
+rule spread_config:
+ input:
+ config = 'config.yaml'
+ output:
+ train = 'cv_split{cv_split}/deeprvat/config.yaml',
+ params:
+ out_path = 'cv_split{cv_split}/'
+ threads: 1
+ resources:
+ mem_mb = 1024,
+ load = 1000
+ shell:
+ ' && '.join([
+ conda_check,
+ 'deeprvat_cv_utils spread-config '
+ '-m deeprvat '
+ '--fold {wildcards.cv_split} '
+ # '--fold-specific-baseline '
+ f'--n-folds {cv_splits}'
+ ' {input.config} {params.out_path}'
+ ])
+
+
+# # ############################### Run DeepRVAT ##############################################################
+# # ###########################################################################################################
+module deeprvat_workflow:
+ snakefile:
+ "../training_association_testing.snakefile"
+ # f"{DEEPRVAT_DIR}/pipelines/training_association_testing_with_prefix.snakefile"
+ prefix:
+ 'cv_split{cv_split}/deeprvat'
+ config:
+ config
+
+# use rule * from deeprvat_workflow exclude config, evaluate, association_dataset, train, regress, best_training_run, compute_burdens, link_burdens as deeprvat_*
+
+use rule link_config from deeprvat_workflow as deeprvat_link_config
+
+use rule best_training_run from deeprvat_workflow as deeprvat_best_training_run with:
+ params:
+ prefix = 'cv_split{cv_split}/deeprvat'
+
+
+use rule train from deeprvat_workflow as deeprvat_train with:
+ priority: 1000
+ params:
+ prefix = 'cv_split{cv_split}/deeprvat',
+ phenotypes = " ".join( #TODO like need the prefix here as well
+ [f"--phenotype {p} "
+ f"cv_split{{cv_split}}/deeprvat/{p}/deeprvat/input_tensor.zarr "
+ f"cv_split{{cv_split}}/deeprvat/{p}/deeprvat/covariates.zarr "
+ f"cv_split{{cv_split}}/deeprvat/{p}/deeprvat/y.zarr"
+ for p in training_phenotypes])
+
+use rule training_dataset from deeprvat_workflow as deeprvat_training_dataset
+
+use rule training_dataset_pickle from deeprvat_workflow as deeprvat_training_dataset_pickle
+
+use rule config from deeprvat_workflow as deeprvat_config with:
+ input:
+        config = 'cv_split{cv_split}/deeprvat/config.yaml', # TODO: change this into a CV-specific config
+ baseline = lambda wildcards: [
+ str(Path(r['base']) /wildcards.phenotype / r['type'] /
+ 'eval/burden_associations.parquet')
+ for r in config['baseline_results']
+ ] if wildcards.phenotype in training_phenotypes else []
+ params:
+ baseline_results = lambda wildcards, input: ''.join([
+ f'--baseline-results {b} '
+ for b in input.baseline
+ ]) if wildcards.phenotype in training_phenotypes else ' ',
+ baseline_out = lambda wildcards: f'--baseline-results-out cv_split{wildcards.cv_split}/deeprvat/{wildcards.phenotype}/deeprvat/baseline_results.parquet' if wildcards.phenotype in training_phenotypes else ' ',
+ seed_genes_out = lambda wildcards: f'--seed-genes-out cv_split{wildcards.cv_split}/deeprvat/{wildcards.phenotype}/deeprvat/seed_genes.parquet' if wildcards.phenotype in training_phenotypes else ' '
+
+
+
+
+
+
+
+
diff --git a/pipelines/cv_training/cv_training_association_testing.snakefile b/pipelines/cv_training/cv_training_association_testing.snakefile
new file mode 100644
index 00000000..af551e8a
--- /dev/null
+++ b/pipelines/cv_training/cv_training_association_testing.snakefile
@@ -0,0 +1,105 @@
+from pathlib import Path
+
+
+configfile: "config.yaml"
+
+
+conda_check = 'conda info | grep "active environment"'
+
+debug_flag = config.get("debug", False)
+phenotypes = config["phenotypes"]
+phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes
+training_phenotypes = config["training"].get("phenotypes", phenotypes)
+burden_phenotype = phenotypes[0]
+
+n_burden_chunks = config.get("n_burden_chunks", 1) if not debug_flag else 2
+n_regression_chunks = config.get("n_regression_chunks", 40) if not debug_flag else 2
+n_avg_chunks = config.get('n_avg_chunks', 40)
+n_trials = config["hyperparameter_optimization"]["n_trials"]
+n_bags = config["training"]["n_bags"] if not debug_flag else 3
+n_repeats = config["n_repeats"]
+debug = "--debug " if debug_flag else ""
+do_scoretest = "--do-scoretest " if config.get("do_scoretest", False) else ""
+tensor_compression_level = config["training"].get("tensor_compression_level", 1)
+model_path = Path("models")
+n_parallel_training_jobs = config["training"].get("n_parallel_jobs", 1)
+
+
+wildcard_constraints:
+ repeat="\d+",
+ trial="\d+",
+
+
+cv_splits = config.get("n_folds", 5)
+cv_exp = True
+
+
+
+include: "../association_testing/plot.snakefile"
+include: "cv_training.snakefile"
+include: "cv_burdens.snakefile"
+include: "../association_testing/burdens.snakefile"
+include: "../association_testing/regress_eval.snakefile"
+
+
+
+
+rule all_plot: #plot.snakefile
+ input:
+ "dicovery_replication_plot.png",
+
+
+rule all_evaluate: #plot.snakefile
+ input:
+ significant=expand(
+ "{phenotype}/deeprvat/eval/significant.parquet", phenotype=phenotypes
+ ),
+ results=expand(
+ "{phenotype}/deeprvat/eval/all_results.parquet", phenotype=phenotypes
+ ),
+
+
+rule all_regression: #regress_eval.snakefile
+ input:
+ expand(
+ "{phenotype}/deeprvat/average_regression_results/burden_associations.parquet",
+ phenotype=phenotypes,
+ ),
+
+
+rule all_average_burdens: #burdens.snakefile
+ input:
+ expand(
+ "{phenotype}/deeprvat/burdens/logs/burdens_averaging_{chunk}.finished",
+ chunk=range(n_avg_chunks),
+ phenotype=phenotypes[0],
+ ),
+
+
+rule all_burdens: #cv_burdens.snakefile
+ input:
+ expand("{phenotype}/deeprvat/burdens/merging.finished", phenotype=phenotypes),
+
+
+rule all_training: #cv_training.snakefile
+ input:
+ expand(
+ "cv_split{cv_split}/deeprvat/models/repeat_{repeat}/best/bag_{bag}.ckpt",
+ bag=range(n_bags),
+ repeat=range(n_repeats),
+ cv_split=range(cv_splits),
+ ),
+ expand(
+ "cv_split{cv_split}/deeprvat/models/repeat_{repeat}/config.yaml",
+ repeat=range(n_repeats),
+ cv_split=range(cv_splits),
+ ),
+
+
+rule all_config: #cv_training.snakefile
+ input:
+ expand(
+ "cv_split{cv_split}/deeprvat/{phenotype}/deeprvat/hpopt_config.yaml",
+ phenotype=phenotypes,
+ cv_split=range(cv_splits),
+ ),
diff --git a/pipelines/resources/absplice.yaml b/pipelines/resources/absplice.yaml
new file mode 100755
index 00000000..08494895
--- /dev/null
+++ b/pipelines/resources/absplice.yaml
@@ -0,0 +1,31 @@
+name: absplice
+channels:
+ - conda-forge
+dependencies:
+ - python==3.8
+ - cython==3.0.2
+ - tensorflow==2.4.3
+ - pytest==7.4.2
+ - setuptools==68.2.2
+ - scipy==1.10.1
+ - pandas==2.0.3
+ - tqdm==4.66.1
+ - click==8.1.7
+ - pip==23.2.1
+ - pyarrow==4.0.0
+ - numpy==1.23
+ - seaborn==0.12.2
+ - scikit-learn==1.3.0
+ - bioconda::pyfaidx==0.7.2.1
+ - bioconda::pyranges==0.0.125
+ - bioconda::cyvcf2==0.30.16
+ - bioconda::tabix==1.11
+ - bioconda::snakemake==7.26.0
+ - bioconda::spliceai==1.3.1
+ - bioconda::kipoiseq=0.7.1
+ - deepdiff==6.5.0
+ - pip:
+ - mmsplice==2.4.0
+    - interpret==0.2.7
+    - interpret-core==0.2.7
+ - git+https://github.com/gagneurlab/splicemap.git@9e9831f32c221e850e26757a5f1c132dcd565640
diff --git a/pipelines/resources/absplice_config_cat.yaml b/pipelines/resources/absplice_config_cat.yaml
new file mode 100644
index 00000000..3ac0546a
--- /dev/null
+++ b/pipelines/resources/absplice_config_cat.yaml
@@ -0,0 +1,16 @@
+gene_map: ../../absplice/precomputed/GENE_MAP.tsv.gz
+
+# mapping of variants to individuals, result of variant filtering on provided vcfs
+variant_sample_map: ../data/resources/analysis_files/variant_sample_map/{vcf_id}_variant_sample_map.csv
+
+cat_count_table:
+ raw: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_count_table/tissue_cat={tissue_cat}_count_table_raw.csv
+ updated: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_count_table/tissue_cat={tissue_cat}_count_table.csv
+cat_outliers:
+ qual_filtered:
+ junction_level: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/qual_filtered/tissue_cat={tissue_cat}_junction_level_signif.csv
+ gene_level: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/qual_filtered/tissue_cat={tissue_cat}_gene_level.csv
+ combine_gene_junction:
+ gene_junction_signif: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/combine_gene_junction/gene_junction_signif/tissue_cat={tissue_cat}_signif.csv
+ outlier_with_variant: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/combine_gene_junction/outlier_with_variant/{vcf_id}_tissue_cat={tissue_cat}_outlier_with_variant.csv
+ minus_log10_pval: ../data/resources/analysis_files/absplice_rna_related_files/processed/cat_outliers/combine_gene_junction/minus_log10_pval/{vcf_id}_tissue_cat={tissue_cat}_FRASER_pval.csv
\ No newline at end of file
diff --git a/pipelines/resources/absplice_config_download.yaml b/pipelines/resources/absplice_config_download.yaml
new file mode 100644
index 00000000..18480e38
--- /dev/null
+++ b/pipelines/resources/absplice_config_download.yaml
@@ -0,0 +1,83 @@
+fasta:
+ hg19:
+ url: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz
+ file: 'GRCh37.primary_assembly.genome.fa'
+ hg38:
+ url: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/GRCh38.primary_assembly.genome.fa.gz
+ file: 'GRCh38.primary_assembly.genome.fa'
+
+gtf:
+ hg19:
+ url: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/GRCh37_mapping/gencode.v39lift37.annotation.gtf.gz
+ file: 'gencode.v39lift37.annotation.gtf'
+ coding_genes: 'hg19_coding_genes.csv'
+ hg38:
+ url: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/gencode.v39.annotation.gtf.gz
+ file: 'gencode.v39.annotation.gtf'
+ coding_genes: 'hg38_coding_genes.csv'
+
+splicemap_dir: 'splicemap_{genome}/'
+
+splicemap:
+ psi3: 'splicemap_{genome}/{tissue}_splicemap_psi3_method=kn_event_filter=median_cutoff.csv.gz'
+ psi5: 'splicemap_{genome}/{tissue}_splicemap_psi5_method=kn_event_filter=median_cutoff.csv.gz'
+
+spliceai_rocksdb:
+ hg19: 'spliceAI_grch37_chr{chromosome}.db'
+ hg38: 'spliceAI_grch38_chr{chromosome}.db'
+chromosomes: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']
+
+gnomad_rocksdb:
+ hg19: 'gnomad_maf_db_hg19/gnomad_maf_db_2.1.1.db'
+ hg38: 'gnomad_maf_db_hg38/gnomad_maf_db_3.1.2.db'
+
+all_available_splicemap_tissues:
+ - Adipose_Subcutaneous
+ - Adipose_Visceral_Omentum
+ - Adrenal_Gland
+ - Artery_Aorta
+ - Artery_Coronary
+ - Artery_Tibial
+ - Brain_Amygdala
+ - Brain_Anterior_cingulate_cortex_BA24
+ - Brain_Caudate_basal_ganglia
+ - Brain_Cerebellar_Hemisphere
+ - Brain_Cerebellum
+ - Brain_Cortex
+ - Brain_Frontal_Cortex_BA9
+ - Brain_Hippocampus
+ - Brain_Hypothalamus
+ - Brain_Nucleus_accumbens_basal_ganglia
+ - Brain_Putamen_basal_ganglia
+ - Brain_Spinal_cord_cervical_c_1
+ - Brain_Substantia_nigra
+ - Breast_Mammary_Tissue
+ - Cells_Cultured_fibroblasts
+ - Cells_EBV_transformed_lymphocytes
+ - Colon_Sigmoid
+ - Colon_Transverse
+ - Esophagus_Gastroesophageal_Junction
+ - Esophagus_Mucosa
+ - Esophagus_Muscularis
+ - Heart_Atrial_Appendage
+ - Heart_Left_Ventricle
+ - Kidney_Cortex
+ - Liver
+ - Lung
+ - Minor_Salivary_Gland
+ - Muscle_Skeletal
+ - Nerve_Tibial
+ - Ovary
+ - Pancreas
+ - Pituitary
+ - Prostate
+ - Skin_Not_Sun_Exposed_Suprapubic
+ - Skin_Sun_Exposed_Lower_leg
+ - Small_Intestine_Terminal_Ileum
+ - Spleen
+ - Stomach
+ - Testis
+ - Thyroid
+ - Uterus
+ - Vagina
+ - Whole_Blood
diff --git a/pipelines/resources/absplice_config_pred.yaml b/pipelines/resources/absplice_config_pred.yaml
new file mode 100644
index 00000000..75e02f8a
--- /dev/null
+++ b/pipelines/resources/absplice_config_pred.yaml
@@ -0,0 +1,9 @@
+splicing_pred:
+ mmsplice_splicemap: '{genome}/model_scores_from_absplice_features/{vcf_id}_MMSplice_SpliceMap.csv'
+ spliceai_vcf: '{genome}/model_scores_from_absplice_features/{vcf_id}_SpliceAI.vcf'
+ spliceai: '{genome}/model_scores_from_absplice_features/{vcf_id}_SpliceAI.csv'
+ absplice_dna: '{genome}/dna/{vcf_id}_AbSplice_DNA.csv'
+ delta_psi_inferred_from_cat: '{genome}/model_scores_from_absplice_features/{vcf_id}_tissue_cat={tissue_cat}_delta_psi_inferred_from_cat.csv'
+ cat_outliers: '{genome}/model_scores_from_absplice_features/{vcf_id}_tissue_cat={tissue_cat}_FRASER_pval.csv'
+ absplice_rna: '{genome}/{vcf_id}_tissue_cat={tissue_cat}_AbSplice_RNA.csv'
+ absplice_rna_with_dna_info: '{genome}/{vcf_id}_tissue_cat={tissue_cat}_AbSplice_all_info.csv'
\ No newline at end of file
diff --git a/pipelines/resources/absplice_dna.py b/pipelines/resources/absplice_dna.py
new file mode 100644
index 00000000..348eea49
--- /dev/null
+++ b/pipelines/resources/absplice_dna.py
@@ -0,0 +1,8 @@
+from absplice import SplicingOutlierResult
+
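+# Combine the MMSplice+SpliceMap and SpliceAI predictions passed in by Snakemake
+# and write the resulting AbSplice-DNA scores to CSV.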
+splicing_result = SplicingOutlierResult(
+ df_mmsplice=snakemake.input["mmsplice_splicemap"],
+ df_spliceai=snakemake.input["spliceai"],
+)
+splicing_result.predict_absplice_dna(extra_info=snakemake.params["extra_info"])
+splicing_result._absplice_dna.to_csv(snakemake.output["absplice_dna"])
diff --git a/pipelines/resources/absplice_download.snakefile b/pipelines/resources/absplice_download.snakefile
new file mode 100644
index 00000000..2ccb8483
--- /dev/null
+++ b/pipelines/resources/absplice_download.snakefile
@@ -0,0 +1,146 @@
+import os
+from pathlib import Path
+
+
+genome = absplice_main_conf['genome']
+
+all_splicemap_tissues = absplice_main_conf['splicemap_tissues']
+if 'tissue_cat' in absplice_main_conf.keys():
+ all_splicemap_tissues.append(absplice_main_conf['tissue_cat'])
+all_splicemap_tissues = sorted(set(all_splicemap_tissues))
+all_splicemap_tissues = [
+ tissue for tissue in all_splicemap_tissues
+ if tissue in config_download['all_available_splicemap_tissues']]
+
+def splicemap5(wildcards):
+ path = Path(absplice_download_dir) / Path(config_download['splicemap_dir'])
+ splicemaps = [
+ path / f'{tissue}_splicemap_psi5_method=kn_event_filter=median_cutoff.csv.gz'
+ for tissue in all_splicemap_tissues
+ ]
+ splicemaps = [str(x) for x in splicemaps]
+ return splicemaps
+
+def splicemap3(wildcards):
+ path = Path(absplice_download_dir) / Path(config_download['splicemap_dir'])
+ splicemaps = [
+ path / f'{tissue}_splicemap_psi3_method=kn_event_filter=median_cutoff.csv.gz'
+ for tissue in all_splicemap_tissues
+ ]
+ splicemaps = [str(x) for x in splicemaps]
+ return splicemaps
+
+def splicemap_dir_name(filename):
+ return os.path.dirname(filename)
+
+splicemap_v_mapper = {
+ 'hg38': 'gtex_v8',
+ 'hg19': 'gtex_v7',
+}
+
+list_outputs = list()
+
+rule download_human_fasta:
+ params:
+ config_download['fasta'][genome]['url']
+ output:
+ Path(absplice_download_dir) / config_download['fasta'][genome]['file']
+ conda:
+ "absplice"
+ shell:
+ "wget -O - {params} | gunzip -c > {output}"
+list_outputs.append(Path(absplice_download_dir) /config_download['fasta'][genome]['file'])
+
+rule download_splicemaps:
+ params:
+ version = splicemap_v_mapper[absplice_main_conf['genome']],
+ dirname = splicemap_dir_name(Path(absplice_download_dir) / config_download['splicemap']['psi3'])
+ output:
+ splicemap_psi3 = Path(absplice_download_dir) / config_download['splicemap']['psi3'],
+ splicemap_psi5 = Path(absplice_download_dir) / config_download['splicemap']['psi5'],
+ conda:
+ "absplice"
+ shell:
+ "splicemap_download --version {params.version} --splicemap_dir {params.dirname} --tissues {wildcards.tissue}"
+list_outputs.append(
+ expand(Path(absplice_download_dir) /config_download['splicemap']['psi3'],
+ genome = absplice_main_conf['genome'], tissue = absplice_main_conf['splicemap_tissues'],
+ )
+)
+list_outputs.append(
+ expand(Path(absplice_download_dir) /config_download['splicemap']['psi5'],
+ genome = absplice_main_conf['genome'], tissue = absplice_main_conf['splicemap_tissues'],
+ ),
+)
+
+if absplice_main_conf['AbSplice_RNA'] == True:
+ rule download_human_gtf:
+ params:
+ config_download['gtf'][genome]['url']
+ output:
+ Path(absplice_download_dir) / config_download['gtf'][genome]['file']
+ conda:
+ "absplice"
+ shell:
+ "wget -O - {params} | gunzip -c > {output}"
+ list_outputs.append(Path(absplice_download_dir) /config_download['gtf'][genome]['file'])
+
+
+ rule coding_genes:
+ input:
+ gtf_file = Path(absplice_download_dir) / config_download['gtf'][genome]['file'],
+ output:
+ coding_genes = Path(absplice_download_dir) / config_download['gtf'][genome]['coding_genes']
+ conda:
+ "absplice"
+ resources:
+ mem_mb = lambda wildcards, attempt: attempt * 16000,
+ script:
+ "./coding_genes.py"
+ list_outputs.append(Path(absplice_download_dir) /config_download['gtf'][genome]['coding_genes'])
+
+
+ maf_version_mapper = {
+ 'hg38': '3.1.2',
+ 'hg19': '2.1.1',
+ }
+ rule download_gnomad_maf_db:
+ params:
+ version = maf_version_mapper[absplice_main_conf['genome']]
+ conda:
+ "./environment_gnomad_rocksdb.yaml"
+ output:
+ gnomad_maf_db = directory(Path(absplice_download_dir) /config_download['gnomad_rocksdb'][genome])
+ shell:
+ "gnomad_rocksdb_download --version {params.version} --db_path {output.gnomad_maf_db}"
+ list_outputs.append(Path(absplice_download_dir) /config_download['gnomad_rocksdb'][genome])
+
+if absplice_main_conf['use_rocksdb'] == True:
+ genome_mapper = {
+ 'hg38': 'grch38',
+ 'hg19': 'grch37',
+ }
+ rule download_rocksdb:
+ params:
+ version = genome_mapper[absplice_main_conf['genome']]
+ conda:
+ f"./environment_spliceai_rocksdb.yaml"
+ output:
+            spliceai_rocksdb = directory(Path(absplice_download_dir) / config_download['spliceai_rocksdb'][genome])
+ shell:
+ "spliceai_rocksdb_download --version {params.version} --db_path {output.spliceai_rocksdb} --chromosome {wildcards.chromosome}"
+ list_outputs.append(
+ expand(Path(absplice_download_dir) /config_download['spliceai_rocksdb'][genome],
+ chromosome = config_download['chromosomes'],
+ )
+ )
+
+
+rule all_download:
+ input:
+ list_outputs
+
+
+del splicemap5
+del splicemap3
+
diff --git a/pipelines/resources/absplice_splicing_pred_DNA.snakefile b/pipelines/resources/absplice_splicing_pred_DNA.snakefile
new file mode 100644
index 00000000..913780d3
--- /dev/null
+++ b/pipelines/resources/absplice_splicing_pred_DNA.snakefile
@@ -0,0 +1,134 @@
+from pathlib import Path
+
+genome = absplice_main_conf['genome']
+p = Path(vcf_dir)
+vcf_ids = [f.name for f in p.glob('*_variants_header.vcf.gz')]
+
+
+def splicemap5(wildcards):
+ path = Path(absplice_download_dir) / config_download['splicemap_dir']
+ splicemaps = [
+ path / f'{tissue}_splicemap_psi5_method=kn_event_filter=median_cutoff.csv.gz'
+ for tissue in absplice_main_conf['splicemap_tissues']
+ ]
+ splicemaps = [str(x) for x in splicemaps]
+ return splicemaps
+
+def splicemap3(wildcards):
+ path = Path(absplice_download_dir) / config_download['splicemap_dir']
+ splicemaps = [
+ path / f'{tissue}_splicemap_psi3_method=kn_event_filter=median_cutoff.csv.gz'
+ for tissue in absplice_main_conf['splicemap_tissues']
+ ]
+ splicemaps = [str(x) for x in splicemaps]
+ return splicemaps
+
+
+rule mmsplice_splicemap:
+ input:
+ vcf = vcf_id,
+ fasta = Path(absplice_download_dir) / config_download['fasta'][genome]['file'],
+ splicemap_5 = splicemap5,
+ splicemap_3 = splicemap3
+ resources:
+ mem_mb = 30_000,
+ threads = 4
+ conda:
+ "absplice"
+ output:
+ result = Path(absplice_output_dir)/ config_pred['splicing_pred']['mmsplice_splicemap']
+ script:
+ "./mmsplice_splicemap.py"
+
+
+
+
+if absplice_main_conf['use_rocksdb'] == True:
+ genome_mapper = {
+ 'hg38': 'grch38',
+ 'hg19': 'grch37',
+ }
+
+ def dict_path(wildcards):
+ paths = {}
+ genome = wildcards['genome']
+ for chr in config_download['chromosomes']:
+ paths[chr] = str(Path(absplice_download_dir) / config_download['spliceai_rocksdb'][genome].format(chromosome=chr))
+ return paths
+
+ rule spliceai:
+ resources:
+ mem_mb = lambda wildcards, attempt: attempt * 16000,
+ threads = 1,
+ gpu = 1,
+ input:
+ vcf = vcf_id,
+ fasta = str(Path(absplice_download_dir) / config_download['fasta'][genome]['file']),
+ spliceai_rocksdb = expand(Path(absplice_download_dir) / config_download['spliceai_rocksdb'][genome],
+ chromosome=config_download['chromosomes'])
+ params:
+ db_path = dict_path,
+ lookup_only = False,
+ genome = genome_mapper[absplice_main_conf['genome']]
+ conda:
+ f"./environment_spliceai_rocksdb.yaml"
+ output:
+ result = Path(absplice_output_dir)/config_pred['splicing_pred']['spliceai']
+ script:
+ "./spliceai.py"
+else:
+ genome_mapper = {
+ 'hg38': 'grch38',
+ 'hg19': 'grch37',
+ }
+ rule spliceai:
+ resources:
+ mem_mb = lambda wildcards, attempt: attempt * 16000,
+ threads = 1,
+ gpu = 1,
+ input:
+ vcf = vcf_id,
+ fasta = Path(absplice_download_dir) /config_download['fasta'][genome]['file']
+ params:
+ genome = genome_mapper[absplice_main_conf['genome']]
+ conda:
+ f"./environment_spliceai_rocksdb.yaml"
+ output:
+ result = config_pred['splicing_pred']['spliceai_vcf']
+ shell:
+ 'spliceai -I {input.vcf} -O {output.result} -R {input.fasta} -A {params.genome}'
+
+ rule spliceai_vcf_to_csv:
+ input:
+ spliceai_vcf = Path(absplice_output_dir) / config_pred['splicing_pred']['spliceai_vcf'],
+ output:
+ spliceai_csv = Path(absplice_output_dir) / config_pred['splicing_pred']['spliceai'],
+ conda:
+ "absplice"
+ run:
+ from absplice.utils import read_spliceai_vcf
+ df = read_spliceai_vcf(input.spliceai_vcf)
+ df.to_csv(output.spliceai_csv, index=False)
+
+rule absplice_dna:
+ resources:
+ mem_mb = lambda wildcards, attempt: attempt * 16_000
+ input:
+ mmsplice_splicemap = Path(absplice_output_dir) / config_pred['splicing_pred']['mmsplice_splicemap'],
+ spliceai = Path(absplice_output_dir) / config_pred['splicing_pred']['spliceai'],
+ params:
+ extra_info = absplice_main_conf['extra_info_dna']
+ conda:
+ "absplice"
+ output:
+ absplice_dna = absplice_output_dir / '{genome}' / 'dna' / '{vcf_id}_AbSplice_DNA.csv'
+ script:
+ "./absplice_dna.py"
+
+rule all_predict_dna:
+ input:
+ #expand([absplice_output_dir / absplice_main_conf['genome'] / 'dna' / source_variant_file_pattern],zip, chr=chromosomes, block=block)
+ expand([absplice_output_dir / absplice_main_conf['genome'] / 'dna' / '{file_stem}_AbSplice_DNA.csv'],file_stem=file_stems)
+
+del splicemap5
+del splicemap3
diff --git a/pipelines/resources/absplice_splicing_pred_RNA.snakefile b/pipelines/resources/absplice_splicing_pred_RNA.snakefile
new file mode 100644
index 00000000..9bd210ec
--- /dev/null
+++ b/pipelines/resources/absplice_splicing_pred_RNA.snakefile
@@ -0,0 +1,96 @@
+import os
+from os import listdir
+from pathlib import Path
+
+include: "../DNA/rare_variants/Snakefile"
+include: "./count_table/Snakefile"
+include: "./outliers/Snakefile"
+
+genome = absplice_main_conf['genome']
+vcf_dir = os.path.dirname(absplice_main_conf['vcf'])
+vcf_ids = [file for file in listdir(vcf_dir) if '.tbi' not in file]
+
+def splicemap5(wildcards):
+ path = Path(config_download['splicemap_dir'])
+ splicemaps = [
+ path / f'{tissue}_splicemap_psi5_method=kn_event_filter=median_cutoff.csv.gz'
+ for tissue in absplice_main_conf['splicemap_tissues']
+ ]
+ splicemaps = [str(x) for x in splicemaps]
+ return splicemaps
+
+def splicemap3(wildcards):
+ path = Path(config_download['splicemap_dir'])
+ splicemaps = [
+ path / f'{tissue}_splicemap_psi3_method=kn_event_filter=median_cutoff.csv.gz'
+ for tissue in absplice_main_conf['splicemap_tissues']
+ ]
+ splicemaps = [str(x) for x in splicemaps]
+ return splicemaps
+
+def splicemap_cat5(wildcards):
+ path = Path(config_download['splicemap_dir'])
+ return path / f'{wildcards.tissue_cat}_splicemap_psi5_method=kn_event_filter=median_cutoff.csv.gz'
+
+def splicemap_cat3(wildcards):
+ path = Path(config_download['splicemap_dir'])
+ return path / f'{wildcards.tissue_cat}_splicemap_psi3_method=kn_event_filter=median_cutoff.csv.gz'
+
+
+rule infer_delta_psi_from_cat:
+ input:
+ mmsplice_splicemap = config_pred['splicing_pred']['mmsplice_splicemap'],
+ splicemap_5 = splicemap5,
+ splicemap_3 = splicemap3,
+ splicemap_cat5 = splicemap_cat5,
+ splicemap_cat3 = splicemap_cat3,
+ var_samples_df = config_cat['variant_sample_map'],
+ cat_count_table = config_cat['cat_count_table']['updated'],
+ params:
+ tissue_cat = '{tissue_cat}',
+ resources:
+ mem_mb = lambda wildcards, attempt: attempt * 64000,
+ threads = 1
+ output:
+ result = config_pred['splicing_pred']['delta_psi_inferred_from_cat']
+ script:
+ "./infer_delta_psi_from_cat.py"
+
+
+rule absplice_rna:
+ input:
+ df_mmsplice = config_pred['splicing_pred']['mmsplice_splicemap'],
+ df_spliceai = config_pred['splicing_pred']['spliceai'],
+ df_mmsplice_cat = config_pred['splicing_pred']['delta_psi_inferred_from_cat'],
+ df_outliers_cat = config_pred['splicing_pred']['cat_outliers'],
+ var_samples_df = config_cat['variant_sample_map'],
+ params:
+ extra_info = absplice_main_conf['extra_info_rna']
+ output:
+ absplice_rna = config_pred['splicing_pred']['absplice_rna']
+ script:
+ "./absplice_rna.py"
+
+
+rule absplice:
+ input:
+ absplice_dna = config_pred['splicing_pred']['absplice_dna'],
+ absplice_rna = config_pred['splicing_pred']['absplice_rna'],
+ output:
+ absplice = config_pred['splicing_pred']['absplice_rna_with_dna_info']
+ script:
+ './absplice_all_info.py'
+
+
+rule all_predict_rna:
+ input:
+ expand(config_pred['splicing_pred']['delta_psi_inferred_from_cat'],
+ genome = absplice_main_conf['genome'],
+ vcf_id=vcf_ids, tissue_cat=absplice_main_conf['DROP']['DROP_group']),
+ expand(config_pred['splicing_pred']['absplice_rna'],
+ genome = absplice_main_conf['genome'],
+ vcf_id=vcf_ids, tissue_cat=absplice_main_conf['DROP']['DROP_group']),
+ expand(config_pred['splicing_pred']['absplice_rna_with_dna_info'],
+ genome = absplice_main_conf['genome'],
+ vcf_id=vcf_ids, tissue_cat=absplice_main_conf['DROP']['DROP_group']),
+
\ No newline at end of file
diff --git a/pipelines/resources/coding_genes.py b/pipelines/resources/coding_genes.py
new file mode 100644
index 00000000..a98a6068
--- /dev/null
+++ b/pipelines/resources/coding_genes.py
@@ -0,0 +1,23 @@
+import pandas as pd
+import pyranges as pr
+
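+# Keep only protein-coding gene records from the GTF, drop PAR_Y duplicates,
+# and strip the version suffix from Ensembl gene IDs before writing the gene table.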
+gr = pr.read_gtf(snakemake.input["gtf_file"])
+gr = gr[(gr.Feature == "gene") & (gr.gene_type == "protein_coding")]
+df_genes = gr.df
+
+df_genes["gene_id_orig"] = df_genes["gene_id"]
+df_genes["PAR_Y"] = df_genes["gene_id"].apply(lambda x: "PAR_Y" in x)
+df_genes = df_genes[df_genes["PAR_Y"] == False]
+df_genes["gene_id"] = df_genes["gene_id"].apply(lambda x: x.split(".")[0])
+
+columns = [
+ "Chromosome",
+ "Start",
+ "End",
+ "Strand",
+ "gene_id",
+ "gene_id_orig",
+ "gene_name",
+ "gene_type",
+]
+df_genes[columns].to_csv(snakemake.output["coding_genes"], index=False)
diff --git a/pipelines/resources/config_absplice.yaml b/pipelines/resources/config_absplice.yaml
new file mode 100644
index 00000000..659ac4d9
--- /dev/null
+++ b/pipelines/resources/config_absplice.yaml
@@ -0,0 +1,90 @@
+# ============================= AbSplice-DNA related ================================
+
+# specify genome version hg19 or hg38
+genome: hg38
+extra_info_dna: False
+
+
+# parameters for variant filtering
+variant_filtering:
+ minor_allele_frequency:
+ apply_filter: False
+ filter_cutoff: 0.001
+ max_num_samples:
+ apply_filter: False
+ filter_cutoff: 2
+
+use_rocksdb: True
+
+splicemap_tissues:
+ - Adipose_Subcutaneous
+ - Adipose_Visceral_Omentum
+ - Adrenal_Gland
+ - Artery_Aorta
+ - Artery_Coronary
+ - Artery_Tibial
+ - Brain_Amygdala
+ - Brain_Anterior_cingulate_cortex_BA24
+ - Brain_Caudate_basal_ganglia
+ - Brain_Cerebellar_Hemisphere
+ - Brain_Cerebellum
+ - Brain_Cortex
+ - Brain_Frontal_Cortex_BA9
+ - Brain_Hippocampus
+ - Brain_Hypothalamus
+ - Brain_Nucleus_accumbens_basal_ganglia
+ - Brain_Putamen_basal_ganglia
+ - Brain_Spinal_cord_cervical_c_1
+ - Brain_Substantia_nigra
+ - Breast_Mammary_Tissue
+ - Cells_Cultured_fibroblasts
+ - Cells_EBV_transformed_lymphocytes
+ - Colon_Sigmoid
+ - Colon_Transverse
+ - Esophagus_Gastroesophageal_Junction
+ - Esophagus_Mucosa
+ - Esophagus_Muscularis
+ - Heart_Atrial_Appendage
+ - Heart_Left_Ventricle
+ - Kidney_Cortex
+ - Liver
+ - Lung
+ - Minor_Salivary_Gland
+ - Muscle_Skeletal
+ - Nerve_Tibial
+ - Ovary
+ - Pancreas
+ - Pituitary
+ - Prostate
+ - Skin_Not_Sun_Exposed_Suprapubic
+ - Skin_Sun_Exposed_Lower_leg
+ - Small_Intestine_Terminal_Ileum
+ - Spleen
+ - Stomach
+ - Testis
+ - Thyroid
+ - Uterus
+ - Vagina
+ - Whole_Blood
+
+
+# ============================= AbSplice-RNA related ================================
+
+AbSplice_RNA: False
+
+DROP:
+ geneAnnotation: 'v29'
+ DROP_group:
+ - 'Cells_Cultured_fibroblasts'
+ working_dir: ../data/resources/analysis_files/absplice_rna_related_files/DROP/
+ sample_annotation: processed_data/aberrant_splicing/annotations/{tissue_cat}.tsv
+ count_table: processed_data/aberrant_splicing/datasets/
+ outliers: processed_results/aberrant_splicing/results/{geneAnnotation}/
+
+
+cat_outlier_filtering:
+ padjustGene_cutoff: 0.1
+ padjust_junction_cutoff: 0.05
+ totalCounts_cutoff: 20
+ delta_psi_cutoff: 0.3
+ outlier_type: 'psi5__psi3__theta'
diff --git a/pipelines/resources/environment_FRASER.yaml b/pipelines/resources/environment_FRASER.yaml
new file mode 100755
index 00000000..3ef8ac03
--- /dev/null
+++ b/pipelines/resources/environment_FRASER.yaml
@@ -0,0 +1,7 @@
+name: fraser_test_run
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - python==3.8
+ - drop
\ No newline at end of file
diff --git a/pipelines/resources/environment_gnomad_rocksdb.yaml b/pipelines/resources/environment_gnomad_rocksdb.yaml
new file mode 100755
index 00000000..9df2c970
--- /dev/null
+++ b/pipelines/resources/environment_gnomad_rocksdb.yaml
@@ -0,0 +1,19 @@
+name: gnomad-rocksdb_test_run2
+channels:
+ - conda-forge
+dependencies:
+ - python>=3.6
+ - python-rocksdb>=0.7
+ - rocksdb
+ - cython
+ - pandas
+ - pip
+ - snakemake>=6.1
+ - click
+ - wget
+ - bioconda::cyvcf2==0.30.16
+ - pip:
+ - git+https://github.com/gagneurlab/gnomad_rocksdb.git
+ - git+https://github.com/gagneurlab/splicemap.git
+ - git+https://github.com/gagneurlab/absplice.git@master
+ - git+https://github.com/kipoi/kipoiseq.git@master
\ No newline at end of file
diff --git a/pipelines/resources/environment_spliceai_rocksdb.yaml b/pipelines/resources/environment_spliceai_rocksdb.yaml
new file mode 100755
index 00000000..f59f3261
--- /dev/null
+++ b/pipelines/resources/environment_spliceai_rocksdb.yaml
@@ -0,0 +1,22 @@
+name: spliceai-rocksdb
+channels:
+ - conda-forge
+dependencies:
+ - python==3.9.15
+ - python-rocksdb==0.7.0
+ - rocksdb==6.13.3
+ - tensorflow-gpu==2.6.2
+ - cudnn==8.8.0.121
+ - cudatoolkit==11.0.3
+ - pyarrow==6.0.1
+ - tqdm==4.65.0
+ - pip==23.2.1
+ - setuptools==68.0.0
+ - click==8.1.6
+ - pooch==1.7.0
+ - bioconda::snakemake==7.31.0
+ - bioconda::kipoiseq==0.7.1
+ - bioconda::spliceai==1.3.1
+ - bioconda::cyvcf2==0.30.16
+ - pip:
+ - git+https://github.com/gagneurlab/spliceai_rocksdb.git@0.0.1
\ No newline at end of file
diff --git a/pipelines/resources/mmsplice_splicemap.py b/pipelines/resources/mmsplice_splicemap.py
new file mode 100644
index 00000000..5d193d87
--- /dev/null
+++ b/pipelines/resources/mmsplice_splicemap.py
@@ -0,0 +1,11 @@
+from absplice import SpliceOutlier, SpliceOutlierDataloader
+
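+# Load variants from the VCF together with the reference FASTA and the
+# tissue-specific psi5/psi3 SpliceMaps, then run MMSplice and save the predictions.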
+dl = SpliceOutlierDataloader(
+ snakemake.input["fasta"],
+ snakemake.input["vcf"],
+ splicemap5=list(snakemake.input["splicemap_5"]),
+ splicemap3=list(snakemake.input["splicemap_3"]),
+)
+
+model = SpliceOutlier()
+model.predict_save(dl, snakemake.output["result"])
diff --git a/pipelines/resources/spliceai.py b/pipelines/resources/spliceai.py
new file mode 100644
index 00000000..71ab5ff5
--- /dev/null
+++ b/pipelines/resources/spliceai.py
@@ -0,0 +1,14 @@
+from spliceai_rocksdb.spliceAI import SpliceAI
+
+
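+# Build the SpliceAI predictor either for database lookups only or with the
+# reference FASTA and genome annotation; the RocksDB path is passed in both cases.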
+if snakemake.params["lookup_only"]:
+ model = SpliceAI(db_path=snakemake.params["db_path"])
+else:
+ model = SpliceAI(
+ snakemake.input["fasta"],
+ annotation=snakemake.params["genome"],
+ db_path=snakemake.params["db_path"],
+ )
+
+
+model.predict_save(snakemake.input["vcf"], snakemake.output["result"])
diff --git a/pipelines/seed_gene_discovery.snakefile b/pipelines/seed_gene_discovery.snakefile
index 7a93ac26..183c645e 100644
--- a/pipelines/seed_gene_discovery.snakefile
+++ b/pipelines/seed_gene_discovery.snakefile
@@ -140,7 +140,7 @@ rule regress_plof:
threads: 1
priority: 30
resources:
- mem_mb = lambda wildcards, attempt: 20000 + 2000 * attempt,
+ mem_mb=lambda wildcards, attempt: 20000 + 2000 * attempt,
load=8000,
# gpus = 1
shell:
@@ -176,7 +176,7 @@ rule regress_missense:
threads: 1
priority: 30
resources:
- mem_mb = lambda wildcards, attempt: 30000 + 6000 * attempt,
+ mem_mb=lambda wildcards, attempt: 30000 + 6000 * attempt,
load=8000,
# gpus = 1
shell:
@@ -249,6 +249,7 @@ rule config:
"{phenotype}/{vtype}/config.yaml",
params:
rare_maf=str(rare_maf),
+ maf_column="MAF",
threads: 1
resources:
mem_mb=1024,
@@ -261,7 +262,7 @@ rule config:
"seed_gene_pipeline update-config "
+ "--phenotype {wildcards.phenotype} "
+ "--variant-type {wildcards.vtype} "
- + "--maf-column MAF "
+ + "--maf-column {params.maf_column} "
+ "--rare-maf "
+ "{params.rare_maf}"
+ " {input.config} "
@@ -269,5 +270,3 @@ rule config:
),
]
)
-
-
diff --git a/pipelines/training/config.snakefile b/pipelines/training/config.snakefile
index 3c58a39d..93075fff 100644
--- a/pipelines/training/config.snakefile
+++ b/pipelines/training/config.snakefile
@@ -1,29 +1,45 @@
rule config:
input:
- config = 'config.yaml',
- baseline = lambda wildcards: [
- str(Path(r['base']) / wildcards.phenotype / r['type'] /
- 'eval/burden_associations.parquet')
- for r in config['baseline_results']
+ config="config.yaml",
+ baseline=lambda wildcards: [
+ str(
+ Path(r["base"])
+ / wildcards.phenotype
+ / r["type"]
+ / "eval/burden_associations.parquet"
+ )
+ for r in config["baseline_results"]
]
+ if wildcards.phenotype in training_phenotypes
+ else [],
output:
- seed_genes = '{phenotype}/deeprvat/seed_genes.parquet',
- config = '{phenotype}/deeprvat/hpopt_config.yaml',
- baseline = '{phenotype}/deeprvat/baseline_results.parquet',
+ # seed_genes = '{phenotype}/deeprvat/seed_genes.parquet',
+ config="{phenotype}/deeprvat/hpopt_config.yaml",
+ # baseline = '{phenotype}/deeprvat/baseline_results.parquet',
threads: 1
+ resources:
+ mem_mb=1024,
+ load=1000,
params:
- baseline_results = lambda wildcards, input: ''.join([
- f'--baseline-results {b} '
- for b in input.baseline
- ])
+ baseline_results=lambda wildcards, input: "".join(
+ [f"--baseline-results {b} " for b in input.baseline]
+ )
+ if wildcards.phenotype in training_phenotypes
+ else " ",
+ seed_genes_out=lambda wildcards: f"--seed-genes-out {wildcards.phenotype}/deeprvat/seed_genes.parquet"
+ if wildcards.phenotype in training_phenotypes
+ else " ",
+ baseline_out=lambda wildcards: f"--baseline-results-out {wildcards.phenotype}/deeprvat/baseline_results.parquet"
+ if wildcards.phenotype in training_phenotypes
+ else " ",
shell:
(
- 'deeprvat_config update-config '
- '--phenotype {wildcards.phenotype} '
- '{params.baseline_results}'
- '--baseline-results-out {output.baseline} '
- '--seed-genes-out {output.seed_genes} '
- '{input.config} '
- '{output.config}'
- )
\ No newline at end of file
+ "deeprvat_config update-config "
+ "--phenotype {wildcards.phenotype} "
+ "{params.baseline_results}"
+ "{params.baseline_out} "
+ "{params.seed_genes_out} "
+ "{input.config} "
+ "{output.config}"
+ )
diff --git a/pipelines/training/train.snakefile b/pipelines/training/train.snakefile
index c747fd1f..2915c6ae 100644
--- a/pipelines/training/train.snakefile
+++ b/pipelines/training/train.snakefile
@@ -1,4 +1,3 @@
-
rule link_config:
input:
model_path / 'repeat_0/config.yaml'
@@ -6,8 +5,8 @@ rule link_config:
model_path / 'config.yaml'
threads: 1
shell:
- "ln -s repeat_0/config.yaml {output}"
-
+ "ln -rfs {input} {output}"
+ # "ln -s repeat_0/config.yaml {output}"
rule best_training_run:
input:
@@ -17,14 +16,19 @@ rule best_training_run:
checkpoints = expand(model_path / 'repeat_{{repeat}}/best/bag_{bag}.ckpt',
bag=range(n_bags)),
config = model_path / 'repeat_{repeat}/config.yaml'
+ params:
+ prefix = '.'
threads: 1
+ resources:
+ mem_mb = 2048,
+ load = 2000
shell:
(
'deeprvat_train best-training-run '
+ debug +
- '{model_path}/repeat_{wildcards.repeat} '
- '{model_path}/repeat_{wildcards.repeat}/best '
- '{model_path}/repeat_{wildcards.repeat}/hyperparameter_optimization.db '
+ '{params.prefix}/{model_path}/repeat_{wildcards.repeat} '
+ '{params.prefix}/{model_path}/repeat_{wildcards.repeat}/best '
+ '{params.prefix}/{model_path}/repeat_{wildcards.repeat}/hyperparameter_optimization.db '
'{output.config}'
)
@@ -49,7 +53,13 @@ rule train:
f"{p}/deeprvat/input_tensor.zarr "
f"{p}/deeprvat/covariates.zarr "
f"{p}/deeprvat/y.zarr"
- for p in training_phenotypes])
+ for p in training_phenotypes]),
+ prefix = '.',
+ priority: 1000
+ resources:
+ mem_mb = 2000000, # Using this value will tell our modified lsf.profile not to set a memory resource
+ load = 8000,
+ gpus = 1
shell:
f"parallel --jobs {n_parallel_training_jobs} --halt now,fail=1 --results train_repeat{{{{1}}}}_trial{{{{2}}}}/ "
'deeprvat_train train '
@@ -57,8 +67,8 @@ rule train:
'--trial-id {{2}} '
"{params.phenotypes} "
'config.yaml '
- '{model_path}/repeat_{{1}}/trial{{2}} '
- '{model_path}/repeat_{{1}}/hyperparameter_optimization.db "&&" '
- 'touch {model_path}/repeat_{{1}}/trial{{2}}/finished.tmp '
+ '{params.prefix}/{model_path}/repeat_{{1}}/trial{{2}} '
+ "{params.prefix}/{model_path}/repeat_{{1}}/hyperparameter_optimization.db '&&' "
+ "touch {params.prefix}/{model_path}/repeat_{{1}}/trial{{2}}/finished.tmp "
"::: " + " ".join(map(str, range(n_repeats))) + " "
"::: " + " ".join(map(str, range(n_trials)))
diff --git a/pipelines/training/training_dataset.snakefile b/pipelines/training/training_dataset.snakefile
index 66903b85..2cf00229 100644
--- a/pipelines/training/training_dataset.snakefile
+++ b/pipelines/training/training_dataset.snakefile
@@ -1,37 +1,45 @@
-
rule training_dataset:
input:
- config = '{phenotype}/deeprvat/hpopt_config.yaml',
- training_dataset = '{phenotype}/deeprvat/training_dataset.pkl'
+ config="{phenotype}/deeprvat/hpopt_config.yaml",
+ training_dataset="{phenotype}/deeprvat/training_dataset.pkl",
output:
- input_tensor = directory('{phenotype}/deeprvat/input_tensor.zarr'),
- covariates = directory('{phenotype}/deeprvat/covariates.zarr'),
- y = directory('{phenotype}/deeprvat/y.zarr')
+ input_tensor=directory("{phenotype}/deeprvat/input_tensor.zarr"),
+ covariates=directory("{phenotype}/deeprvat/covariates.zarr"),
+ y=directory("{phenotype}/deeprvat/y.zarr"),
threads: 8
- priority: 50
+ resources:
+ mem_mb=lambda wildcards, attempt: 32000 + 12000 * attempt,
+ load=16000,
+ priority: 5000
shell:
(
- 'deeprvat_train make-dataset '
- + debug +
- '--compression-level ' + str(tensor_compression_level) + ' '
- '--training-dataset-file {input.training_dataset} '
- '{input.config} '
- '{output.input_tensor} '
- '{output.covariates} '
- '{output.y}'
+ "deeprvat_train make-dataset "
+ + debug
+ + "--compression-level "
+ + str(tensor_compression_level)
+ + " "
+ "--training-dataset-file {input.training_dataset} "
+ "{input.config} "
+ "{output.input_tensor} "
+ "{output.covariates} "
+ "{output.y}"
)
+
rule training_dataset_pickle:
input:
- '{phenotype}/deeprvat/hpopt_config.yaml'
+ "{phenotype}/deeprvat/hpopt_config.yaml",
output:
- '{phenotype}/deeprvat/training_dataset.pkl'
+ "{phenotype}/deeprvat/training_dataset.pkl",
threads: 1
+ resources:
+ mem_mb=40000, # lambda wildcards, attempt: 38000 + 12000 * attempt
+ load=16000,
shell:
(
- 'deeprvat_train make-dataset '
- '--pickle-only '
- '--training-dataset-file {output} '
- '{input} '
- 'dummy dummy dummy'
+ "deeprvat_train make-dataset "
+ "--pickle-only "
+ "--training-dataset-file {output} "
+ "{input} "
+ "dummy dummy dummy"
)
\ No newline at end of file
diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile
index 60384eaf..8c360136 100644
--- a/pipelines/training_association_testing.snakefile
+++ b/pipelines/training_association_testing.snakefile
@@ -9,6 +9,7 @@ training_phenotypes = config["training"].get("phenotypes", phenotypes)
n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
+n_avg_chunks = config.get('n_avg_chunks', 40)
n_trials = config['hyperparameter_optimization']['n_trials']
n_bags = config['training']['n_bags'] if not debug_flag else 3
n_repeats = config['n_repeats']
@@ -17,6 +18,7 @@ do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else ''
tensor_compression_level = config['training'].get('tensor_compression_level', 1)
model_path = Path("models")
n_parallel_training_jobs = config["training"].get("n_parallel_jobs", 1)
+cv_exp = False
wildcard_constraints:
repeat="\d+",
@@ -50,6 +52,10 @@ rule all_association_dataset:
expand('{phenotype}/deeprvat/association_dataset.pkl',
phenotype=phenotypes)
+rule all_reversed:
+ input:
+ model_path / "reverse_finished.tmp",
+
rule all_training:
input:
expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt',
@@ -67,9 +73,5 @@ rule all_training_dataset:
rule all_config:
input:
- seed_genes = expand('{phenotype}/deeprvat/seed_genes.parquet',
- phenotype=phenotypes),
config = expand('{phenotype}/deeprvat/hpopt_config.yaml',
phenotype=phenotypes),
- baseline = expand('{phenotype}/deeprvat/baseline_results.parquet',
- phenotype=phenotypes),
diff --git a/pipelines/training_association_testing_regenie.snakefile b/pipelines/training_association_testing_regenie.snakefile
new file mode 100644
index 00000000..3f8a4e01
--- /dev/null
+++ b/pipelines/training_association_testing_regenie.snakefile
@@ -0,0 +1,75 @@
+from pathlib import Path
+
+configfile: 'config.yaml'
+
+debug_flag = config.get('debug', False)
+phenotypes = config['phenotypes']
+phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes
+training_phenotypes = config["training"].get("phenotypes", phenotypes)
+
+n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
+n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
+n_trials = config['hyperparameter_optimization']['n_trials']
+n_bags = config['training']['n_bags'] if not debug_flag else 3
+n_repeats = config['n_repeats']
+debug = '--debug ' if debug_flag else ''
+do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else ''
+tensor_compression_level = config['training'].get('tensor_compression_level', 1)
+model_path = Path("models")
+n_parallel_training_jobs = config["training"].get("n_parallel_jobs", 1)
+
+wildcard_constraints:
+ repeat="\d+",
+ trial="\d+",
+
+include: "training/config.snakefile"
+include: "training/training_dataset.snakefile"
+include: "training/train.snakefile"
+include: "association_testing/association_dataset.snakefile"
+include: "association_testing/burdens.snakefile"
+include: "association_testing/regress_eval_regenie.snakefile"
+
+rule all:
+ input:
+ expand("{phenotype}/deeprvat/eval/significant.parquet",
+ phenotype=phenotypes),
+ expand("{phenotype}/deeprvat/eval/all_results.parquet",
+ phenotype=phenotypes)
+
+rule all_burdens:
+ input:
+ [
+ (f'{p}/deeprvat/burdens/chunk{c}.' +
+ ("finished" if p == phenotypes[0] else "linked"))
+ for p in phenotypes
+ for c in range(n_burden_chunks)
+ ]
+
+rule all_association_dataset:
+ input:
+ expand('{phenotype}/deeprvat/association_dataset.pkl',
+ phenotype=phenotypes)
+
+rule all_training:
+ input:
+ expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt',
+ bag=range(n_bags), repeat=range(n_repeats)),
+ model_path / "config.yaml"
+
+rule all_training_dataset:
+ input:
+ input_tensor = expand('{phenotype}/deeprvat/input_tensor.zarr',
+ phenotype=training_phenotypes, repeat=range(n_repeats)),
+ covariates = expand('{phenotype}/deeprvat/covariates.zarr',
+ phenotype=training_phenotypes, repeat=range(n_repeats)),
+ y = expand('{phenotype}/deeprvat/y.zarr',
+ phenotype=training_phenotypes, repeat=range(n_repeats))
+
+rule all_config:
+ input:
+ seed_genes = expand('{phenotype}/deeprvat/seed_genes.parquet',
+ phenotype=phenotypes),
+ config = expand('{phenotype}/deeprvat/hpopt_config.yaml',
+ phenotype=phenotypes),
+ baseline = expand('{phenotype}/deeprvat/baseline_results.parquet',
+ phenotype=phenotypes),
diff --git a/scripts/make_unrelated_cv_splits.R b/scripts/make_unrelated_cv_splits.R
new file mode 100644
index 00000000..74bb5fbb
--- /dev/null
+++ b/scripts/make_unrelated_cv_splits.R
@@ -0,0 +1,201 @@
+
+library("ukbtools")
+library(dplyr)
+library(ggplot2)
+library(arrow)
+library(stringr)
+library(tidyr)
+
+
+
+phenotype_df = read_parquet('~/ukbb/exomes/vcf/preprocessed/genotypes_phenotypes.parquet') #uses ids from old application
+
+sample_mapping = read.csv('~/ukbb/metadata/sample_map_ukb44975_ukb673180.csv') %>% select(-X)
+
+all_samples = phenotype_df %>% pull('samples')
+
+#################################################
+##### LOAD Kinship matrix and map to samples
+
+ukb_relatedness = read.csv('~/ukbb/metadata/ukb_rel_a81358_s488120.dat', sep = ' ')
+ukb_relatedness %>% arrange(Kinship)
+
+#map kinship ids from new application (673180) to old application (44975)
+kinship_mapped = ukb_relatedness %>% left_join(sample_mapping, by = c('ID1' = 'id_673180'))
+kinship_mapped[is.na(kinship_mapped[['id_44975']]),]
+kinship_mapped = kinship_mapped %>% select(-ID1) %>% rename('ID1' = 'id_44975') %>%
+ left_join(sample_mapping, by = c('ID2' = 'id_673180'))
+kinship_mapped[is.na(kinship_mapped[['id_44975']]),]
+
+kinship_mapped = kinship_mapped %>% select(-ID2) %>% rename('ID2' = 'id_44975')
+
+#################################################
+### get sets of related samples
+
+library(igraph)
+
+# Related sample pairs (kinship >= cutoff), restricted to samples with data
+df <- ukb_gen_related_with_data(kinship_mapped, ukb_with_data = all_samples,
+ cutoff = 0.0884) %>% select(ID1, ID2)
+
+# Create a graph from the data
+graph <- graph_from_data_frame(df, directed = FALSE)
+
+# Find connected components
+connected_sets <- components(graph)
+
+# Get the sets of connected IDs
+connected_id_sets <- lapply(connected_sets$membership, function(m) {
+ V(graph)$name[m]
+})
+
+# Collect the connected sets into a data frame
+
+related_set_df <- tibble(
+ ID = names(connected_id_sets),
+ set_id = unlist(connected_id_sets),
+)
+
+related_set_size = related_set_df %>%
+ group_by(set_id) %>%
+ summarize(size = n()) %>%
+ arrange(desc(size)) %>%
+ mutate(set_idx = row_number())
+summary(related_set_size)
+
+#each ID should only be assigned to a single set
+related_set_df %>%
+ group_by(ID) %>%
+ summarise(n_dist = n_distinct(set_id)) %>%
+ distinct(n_dist)
+
+
+#################################################
+
+#################################################
+
+### up to here everything is fold-independent
+### from here on it becomes fold-dependent
+
+### distribute the related sets across folds
+
+library(reticulate)
+assignSamplesToFolds = function(n_folds, cv_splits_out_dir){
+ ordered_ids = related_set_size[['set_idx']]
+ result_lists <- split(ordered_ids, (ordered_ids - 1) %% n_folds + 1)
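+  # round-robin assignment: set 1 -> fold 1, set 2 -> fold 2, ..., wrapping after n_folds;
+  # sets are ordered by decreasing size, so large related sets are spread evenly across folds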
+
+
+ assigned_folds <- data.frame(
+ fold = rep(names(result_lists), sapply(result_lists, length)),
+ set_idx = unlist(result_lists)
+ ) %>%
+ left_join(related_set_size %>% select(-size))
+
+ # assign sample ids to folds via the set ids
+ sample_id_fold_assignment = related_set_df %>% left_join(assigned_folds)
+ sample_id_fold_assignment %>% group_by(fold) %>%
+ summarize(size = n())
+
+ ##### ##### ##### ##### ##### ##### #####
+ # get the remaining (unrelated) sample IDs and shuffle them
+ set.seed(123)
+
+ assigned_related_samples = sample_id_fold_assignment[['ID']]
+ left_samples = sample(setdiff(all_samples, assigned_related_samples))
+ n_all_samples = length(all_samples)
+
+ stopifnot(length(left_samples) + length(assigned_related_samples) == n_all_samples)
+
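+ # target size per fold is floor(n_all_samples / n_folds); the remainder after integer division is handled below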
+ min_fold_size = floor(n_all_samples / n_folds)
+ stopifnot(min_fold_size * n_folds <= n_all_samples)
+
+ ##### ##### ##### ##### ##### ##### #####
+ # get current fold sizes (after assigning related samples) and number of samples that are still needed
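+ # note: the 'left_samples' column below is the number of samples each fold still needs, not the vector of unassigned sample IDs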
+ fold_sizes = sample_id_fold_assignment %>%
+ select(fold, ID) %>%
+ distinct() %>%
+ group_by(fold) %>%
+ mutate(fold_size = n()) %>%
+ ungroup() %>%
+ select(-ID) %>%
+ distinct() %>%
+ mutate(left_samples = min_fold_size - fold_size,
+ fold = as.numeric(fold)) %>%
+ arrange(fold)
+
+ ##### ##### ##### ##### ##### ##### #####
+ # assign the shuffled remaining (unrelated) samples: each fold takes a consecutive block of the size it still needs
+ folds_left_samples = c()
+ dist_samples = c()
+ start_idx = 1
+ fold = 1
+ for (size in fold_sizes[['left_samples']]){
+ end_idx = start_idx + size - 1
+ cat('size:', size, 'start_idx:', start_idx, 'end_idx:', end_idx, '\n')
+ this_samples = left_samples[start_idx:end_idx]
+ print(length(this_samples))
+ print(length(this_samples) == size)
+ start_idx = end_idx + 1
+ folds_left_samples[[as.character(fold)]] = this_samples
+ dist_samples = append(dist_samples, this_samples)
+ fold = fold + 1
+ }
+
+ ##### ##### ##### ##### ##### ##### #####
+ # if any samples are still unassigned, distribute them round-robin across the folds until none are left
+ left_samples = setdiff(left_samples, dist_samples)
+ if (length(left_samples) > 0){
+ c = 1
+ for (i in left_samples){
+ folds_left_samples[[c]] = append(folds_left_samples[[c]], i)
+ c = ifelse(c == n_folds, 1, c + 1)
+ }
+ }
+ folds_left_samples_df = tibble()
+ for (i in names(folds_left_samples)){
+ t = tibble(fold = as.numeric(i), ID = folds_left_samples[[i]])
+ folds_left_samples_df = rbind(folds_left_samples_df, t)
+ }
+ final_id_fold = rbind(folds_left_samples_df,
+ sample_id_fold_assignment %>% select(ID, fold))
+ stopifnot(nrow(final_id_fold) == n_all_samples)
+
+
+ ##### ##### ##### ##### ##### ##### #####
+ ##### for every fold, write that fold as the test samples and the remaining folds as the train samples
+
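+ # output layout: <cv_splits_out_dir>/<n_folds>_fold/samples_train<k>.pkl and samples_test<k>.pkl, with k zero-based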
+ out_dir = file.path(cv_splits_out_dir,sprintf('%d_fold', n_folds))
+
+ if (!file.exists(out_dir)) {
+ # If it doesn't exist, create the directory
+ dir.create(out_dir)
+ cat("Directory created:", out_dir, "\n")
+ } else {
+ cat("Directory already exists:", out_dir, "\n")
+ }
+ all_folds = unique(final_id_fold[['fold']])
+ # get samples
+ for (test_fold in all_folds){
+ train_folds = setdiff(all_folds, test_fold)
+ test_samples = filter(final_id_fold, fold == test_fold) %>% pull(ID)
+ train_samples = filter(final_id_fold, fold %in% train_folds) %>% pull(ID)
+ stopifnot(length(test_samples) + length(train_samples) == n_all_samples)
+ stopifnot(length(union(test_samples, train_samples)) == n_all_samples)
+
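+ # zero-based fold index used in the output file names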
+ fold_idx = as.numeric(test_fold) -1
+ train_out_file = sprintf('%s/samples_train%s.pkl', out_dir, fold_idx)
+ print(sprintf('Writing train samples to %s', train_out_file))
+ py_save_object(train_samples, train_out_file, pickle = "pickle")
+
+ test_out_file = sprintf('%s/samples_test%s.pkl', out_dir, fold_idx)
+ print(sprintf('Writing test samples to %s', test_out_file))
+ py_save_object(test_samples, test_out_file, pickle = "pickle")
+ }
+ return(final_id_fold)
+}
+
+
+cv_splits_out_dir = '/omics/odcf/analysis/OE0540_projects/ukbb/exomes/vcf/preprocessed/cv_splits_eva/cv_splits_related_in_same_fold'
+
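+# generate both 10-fold and 5-fold splits in the same output directory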
+assignSamplesToFolds(n_folds = 10, cv_splits_out_dir = cv_splits_out_dir)
+assignSamplesToFolds(n_folds = 5, cv_splits_out_dir = cv_splits_out_dir)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index df59222f..15f5830b 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@
"deeprvat_evaluate=deeprvat.deeprvat.evaluate:evaluate",
"seed_gene_pipeline=deeprvat.seed_gene_discovery.seed_gene_discovery:cli",
"seed_gene_evaluate=deeprvat.seed_gene_discovery.evaluate:cli",
+ "deeprvat_cv_utils=deeprvat.cv_utils:cli",
],
},
install_requires=requirements,