From 2c62beba297a14ee748bfbcf4c12bc708e3bb63a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CMarcel-Mueck=E2=80=9D?= <“mueckm1@gmail.com”>
Date: Fri, 13 Oct 2023 16:12:13 +0200
Subject: [PATCH] added merging of absplice and deepsea pca to complete
 annotations in snakefile

---
 deeprvat/annotations/annotations.py            | 61 ++-----------------
 pipelines/annotations.snakefile                | 47 +++++++++-----
 .../config/deeprvat_annotation_config.yaml     |  1 +
 3 files changed, 39 insertions(+), 70 deletions(-)

diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
index 39ffdfb9..534455f7 100644
--- a/deeprvat/annotations/annotations.py
+++ b/deeprvat/annotations/annotations.py
@@ -525,7 +525,7 @@ def deepsea_pca(n_components: int, deepsea_file: str, pca_pkl: str, out_dir: str
     logger.info(f"creating pca object and saving it to {pca_pkl}")
     pca = PCA(n_components=n_components)
     pca.fit(X_std)
-    with open(out_path / "pca.pkl", "wb") as f:
+    with open(pca_pkl, "wb") as f:
         pickle.dump(pca, f)
     logger.info(f"Projecting rows to {n_components} PCs")
     X_pca = pca.transform(X_std)
@@ -548,51 +548,6 @@ def deepsea_pca(n_components: int, deepsea_file: str, pca_pkl: str, out_dir: str
     logger.info("Done")
 
 
-@cli.command()
-@click.option("--n-components", type=int, default=59)
-@click.argument("deepripe-file", type=click.Path(exists=True))
-@click.argument("out-dir", type=click.Path(exists=True))
-def deepripe_pca(n_components: int, deepripe_file: str, out_dir: str):
-    logger.info("Reading deepripe file")
-    df = pd.read_csv(deepripe_file)
-    df = df.rename(
-        columns={
-            "#CHROM": "chrom",
-            "POS": "pos",
-            "ID": "id",
-            "REF": "ref",
-            "ALT": "alt",
-        }
-    )
-    df = df.drop(columns=["QUAL", "FILTER", "INFO"])
-    key_df = df[["chrom", "pos", "ref", "alt", "id"]].reset_index(drop=True)
-
-    logger.info("Extracting matrix for PCA")
-    X = df[[c for c in df.columns if c not in key_df.columns]].to_numpy()
-    del df
-
-    X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
-    del X
-
-    logger.info("Running PCA")
-    pca = PCA(n_components=n_components)
-    pca.fit(X_std)
-    out_path = Path(out_dir)
-    with open(out_path / "pca.pkl", "wb") as f:
-        pickle.dump(pca, f)
-
-    logger.info(f"Projecting rows to {n_components} PCs")
-    X_pca = pca.transform(X_std)
-    del X_std
-    pca_df = pd.DataFrame(
-        X_pca, columns=[f"DeepRipe_PC_{i}" for i in range(1, n_components + 1)]
-    )
-    del X_pca
-    pca_df = pd.concat([key_df, pca_df], axis=1)
-    pca_df.to_parquet(out_path / "deepripe_pca.parquet", engine="pyarrow")
-
-    logger.info("Done")
-
 
 @cli.command()
 @click.argument("variants-file", type=click.Path(exists=True))
@@ -714,10 +669,10 @@ def scorevariants_deepripe(
     for ix, RBP_name in enumerate(RBPnames):
         df_variants[RBP_name] = score_list[:, ix]
     print(
-        f"saving file to: {output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv"
+        f"saving file to: {output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv.gz"
     )
     df_variants.to_csv(
-        f"{output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv", index=False
+        f"{output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv.gz", index=False
     )
 
 
@@ -1091,13 +1046,6 @@ def merge_annotations(vep_header_line:int,
         vep_file,
         header=vep_header_line,
         sep="\t",
-        # dtype={
-        #     "STRAND": str,
-        #     "TSL": str,
-        #     "GENE_PHENO": str,
-        #     "CADD_PHRED": str,
-        #     "CADD_RAW": str,
-        # },
     )
     vep_df = process_vep(vep_file=vep_df)
     logger.info(f"vep_df shape is {vep_df.shape}")
@@ -1139,6 +1087,7 @@ def process_deepripe(deepripe_df:object, column_prefix:str)->object:
     prefix_cols = [x for x in deepripe_df.columns if x not in key_cols]
     new_names = [(i, i + f"_{column_prefix}") for i in prefix_cols]
     deepripe_df = deepripe_df.rename(columns=dict(new_names))
+    deepripe_df.drop_duplicates(subset=["chrom", "pos", "ref", "alt"], inplace=True)
     return deepripe_df
 
 def process_vep(vep_file:object)->object:
@@ -1152,7 +1101,7 @@
     float_vals = ['DISTANCE', 'gnomADg_FIN_AF', 'AF', 'AFR_AF', 'AMR_AF','EAS_AF', 'EUR_AF', 'SAS_AF', 'MAX_AF','MOTIF_POS', 'MOTIF_SCORE_CHANGE', 'CADD_PHRED', 'CADD_RAW', 'PrimateAI', 'TSL', 'Condel']
     vep_file[float_vals] = vep_file[float_vals].replace('-', 'NaN').astype(float)
     dummies = vep_file["Consequence"].str.get_dummies(",").add_prefix("Consequence_")
-    hopefully_all_consequences= ['Consequence_splice_acceptor_variant','Consequence_5_prime_UTR_variant','Consequence_TFBS_ablation','Consequence_start_lost','Consequence_incomplete_terminal_codon_variant','Consequence_intron_variant', 'Consequence_stop_gained', 'Consequence_splice_donor_5th_base_variant', 'Consequence_downstream_gene_variant', 'Consequence_splice_donor_variant', 'Consequence_protein_altering_variant', 'Consequence_splice_polypyrimidine_tract_variant', 'Consequence_inframe_insertion', 'Consequence_mature_miRNA_variant', 'Consequence_synonymous_variant', 'Consequence_regulatory_region_variant', 'Consequence_non_coding_transcript_exon_variant', 'Consequence_stop_lost', 'Consequence_TF_binding_site_variant', 'Consequence_splice_donor_region_variant', 'Consequence_stop_retained_variant', 'Consequence_splice_region_variant', 'Consequence_coding_sequence_variant', 'Consequence_upstream_gene_variant', 'Consequence_frameshift_variant', 'Consequence_start_retained_variant', 'Consequence_3_prime_UTR_variant', 'Consequence_inframe_deletion', 'Consequence_missense_variant', 'Consequence_non_coding_transcript_variant']
+    hopefully_all_consequences= ['Consequence_splice_acceptor_variant','Consequence_5_prime_UTR_variant','Consequence_TFBS_ablation','Consequence_start_lost','Consequence_incomplete_terminal_codon_variant','Consequence_intron_variant', 'Consequence_stop_gained', 'Consequence_splice_donor_5th_base_variant', 'Consequence_downstream_gene_variant', 'Consequence_intergenic_variant', 'Consequence_splice_donor_variant','Consequence_NMD_transcript_variant', 'Consequence_protein_altering_variant', 'Consequence_splice_polypyrimidine_tract_variant', 'Consequence_inframe_insertion', 'Consequence_mature_miRNA_variant', 'Consequence_synonymous_variant', 'Consequence_regulatory_region_variant', 'Consequence_non_coding_transcript_exon_variant', 'Consequence_stop_lost', 'Consequence_TF_binding_site_variant', 'Consequence_splice_donor_region_variant', 'Consequence_stop_retained_variant', 'Consequence_splice_region_variant', 'Consequence_coding_sequence_variant', 'Consequence_upstream_gene_variant', 'Consequence_frameshift_variant', 'Consequence_start_retained_variant', 'Consequence_3_prime_UTR_variant', 'Consequence_inframe_deletion', 'Consequence_missense_variant', 'Consequence_non_coding_transcript_variant']
     hopefully_all_consequences = list(set(hopefully_all_consequences))
     mask = pd.DataFrame(data = np.zeros(shape= ( len(vep_file), len(hopefully_all_consequences))), columns=hopefully_all_consequences , dtype=float)
     mask[list(dummies.columns)]=dummies
diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
index 48804cb6..bd44868e 100644
--- a/pipelines/annotations.snakefile
+++ b/pipelines/annotations.snakefile
@@ -76,7 +76,7 @@ ncores_merge_absplice = int(config.get("n_cores_merge_absplice") or 64)
 n_jobs_deepripe = int(config.get("n_jobs_deepripe") or 8)
 # init kipoi-veff2
 kipoi_repo_dir = Path(config["kipoiveff_repo_dir"])
-ncores_addis = int(config.get("n_jobs_deepripe") or 32)
+ncores_addis = int(config.get("n_jobs_addids") or 32)
 # Filter out which chromosomes to work with
 pvcf_blocks_df = pvcf_blocks_df[
     pvcf_blocks_df["Chromosome"].isin([str(c) for c in included_chromosomes])
@@ -97,7 +97,7 @@ block = pvcf_blocks_df["Block"]
 
 rule all:
     input:
-        anno_dir / "current_annotations.parquet",
+        anno_dir / "current_annotations_absplice.parquet",
 
 
 rule aggregate_and_merge_absplice:
@@ -108,11 +108,9 @@
             chr=chromosomes,
             block=block,
         ),
-        current_annotation_file=anno_dir
-        / "current_annotations_deepsea_deepripe_parclip_hg2_k5.parquet",
+        current_annotation_file=anno_dir / "vep_deepripe_deepsea.parquet",
     output:
-        annotations=anno_dir
-        / "current_annotations_absplice_deepsea_deepripe_parclip_hg2_k5.parquet",
+        annotations=anno_dir / "current_annotations_absplice.parquet",
         scores=anno_tmp_dir / "abSplice_score_file.parquet",
     shell:
@@ -128,6 +126,23 @@
                 f"{ncores_merge_absplice}"
             ])
 
+rule merge_deepsea_pcas:
+    input:
+        annotations=anno_dir / "vep_deepripe.parquet",
+        deepsea_pcas=anno_dir / "deepSea_pca" / "deepsea_pca.parquet",
+    output:
+        anno_dir / "vep_deepripe_deepsea.parquet"
+    shell:
+        " ".join(
+            [
+                "python",
+                f"{annotation_python_file}",
+                "merge-deepsea-pcas",
+                "{input.annotations}",
+                "{input.deepsea_pcas}",
+                "{output}",
+            ]
+        )
 
 rule concat_annotations:
     input:
        pvcf=expand(
            zip,
            chr=chromosomes,
            block=block)
-    output: anno_dir / "current_annotations.parquet"
-    shell: f"python " +
-        str(annotation_python_file) + " concat-annotations "+
-        " {input.pvcf} "+
-        " {input.anno_dir} "+
-        f"{str(vcf_pattern+'_merged.parquet').format(chr='{{chr}}', block='{{block}}')} "+
-        "{output}"+
-        f" --included-chromosomes {','.join(included_chromosomes)}"
+    output: anno_dir / "vep_deepripe.parquet"
+    shell:
+        " ".join([
+            "python",
+            str(annotation_python_file),
+            "concat-annotations",
+            "{input.pvcf}",
+            "{input.anno_dir}",
+            f"{str(vcf_pattern+'_merged.parquet').format(chr='{{chr}}', block='{{block}}')}",
+            "{output}",
+            f" --included-chromosomes {','.join(included_chromosomes)}"
+        ])
 
 rule merge_annotations:
     input:
diff --git a/pipelines/config/deeprvat_annotation_config.yaml b/pipelines/config/deeprvat_annotation_config.yaml
index d201c148..51b7f2a3 100644
--- a/pipelines/config/deeprvat_annotation_config.yaml
+++ b/pipelines/config/deeprvat_annotation_config.yaml
@@ -32,3 +32,4 @@ pybedtools_tmp_path : output_dir/annotations/tmp/pybedtools
 n_jobs_deepripe : 32
 n_cores_merge_absplice : 32
 n_cores_absplice : 32
+deepsea_pca_pickle_filepath : annotations/deepSea_pca/pca.pkl
\ No newline at end of file
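Note on the new merge_deepsea_pcas rule: it invokes a merge-deepsea-pcas subcommand of annotations.py
(python annotations.py merge-deepsea-pcas <annotations.parquet> <deepsea_pca.parquet> <output.parquet>),
but that subcommand itself is not part of this diff. Below is a minimal sketch of what such a command could
look like, assuming the concatenated VEP/DeepRipe annotations and the DeepSEA PCA parquet share the
"chrom", "pos", "ref", "alt" key columns used elsewhere in annotations.py; the actual implementation in the
repository may differ.

# Hypothetical sketch only -- not the implementation shipped in deeprvat/annotations/annotations.py.
import click
import pandas as pd


@click.command()
@click.argument("annotations-path", type=click.Path(exists=True))
@click.argument("deepsea-pca-path", type=click.Path(exists=True))
@click.argument("out-path", type=click.Path())
def merge_deepsea_pcas(annotations_path: str, deepsea_pca_path: str, out_path: str):
    annotations = pd.read_parquet(annotations_path)
    deepsea_pcas = pd.read_parquet(deepsea_pca_path)
    # Assumed join key; the PCA frame written by deepsea_pca() carries these columns.
    key_cols = ["chrom", "pos", "ref", "alt"]
    # Left join so every annotated variant is kept even if it has no DeepSEA PCs.
    merged = annotations.merge(deepsea_pcas, how="left", on=key_cols)
    merged.to_parquet(out_path, engine="pyarrow")


if __name__ == "__main__":
    merge_deepsea_pcas()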