Skip to content

Commit

Permalink
Merge branch 'deepripe-batch' of https://github.com/PMBio/deeprvat in…
Browse files Browse the repository at this point in the history
…to deepripe-batch
  • Loading branch information
endast committed Oct 13, 2023
2 parents 11c69a5 + 138958b commit d9b303b
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 70 deletions.
61 changes: 5 additions & 56 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ def deepsea_pca(n_components: int, deepsea_file: str, pca_pkl: str, out_dir: str
logger.info(f"creating pca object and saving it to {pca_pkl}")
pca = PCA(n_components=n_components)
pca.fit(X_std)
with open(out_path / "pca.pkl", "wb") as f:
with open(pca_pkl, "wb") as f:
pickle.dump(pca, f)
logger.info(f"Projecting rows to {n_components} PCs")
X_pca = pca.transform(X_std)
Expand All @@ -548,51 +548,6 @@ def deepsea_pca(n_components: int, deepsea_file: str, pca_pkl: str, out_dir: str
logger.info("Done")


@cli.command()
@click.option("--n-components", type=int, default=59)
@click.argument("deepripe-file", type=click.Path(exists=True))
@click.argument("out-dir", type=click.Path(exists=True))
def deepripe_pca(n_components: int, deepripe_file: str, out_dir: str):
logger.info("Reading deepripe file")
df = pd.read_csv(deepripe_file)
df = df.rename(
columns={
"#CHROM": "chrom",
"POS": "pos",
"ID": "id",
"REF": "ref",
"ALT": "alt",
}
)
df = df.drop(columns=["QUAL", "FILTER", "INFO"])
key_df = df[["chrom", "pos", "ref", "alt", "id"]].reset_index(drop=True)

logger.info("Extracting matrix for PCA")
X = df[[c for c in df.columns if c not in key_df.columns]].to_numpy()
del df

X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
del X

logger.info("Running PCA")
pca = PCA(n_components=n_components)
pca.fit(X_std)
out_path = Path(out_dir)
with open(out_path / "pca.pkl", "wb") as f:
pickle.dump(pca, f)

logger.info(f"Projecting rows to {n_components} PCs")
X_pca = pca.transform(X_std)
del X_std
pca_df = pd.DataFrame(
X_pca, columns=[f"DeepRipe_PC_{i}" for i in range(1, n_components + 1)]
)
del X_pca
pca_df = pd.concat([key_df, pca_df], axis=1)
pca_df.to_parquet(out_path / "deepripe_pca.parquet", engine="pyarrow")

logger.info("Done")


@cli.command()
@click.argument("variants-file", type=click.Path(exists=True))
Expand Down Expand Up @@ -714,10 +669,10 @@ def scorevariants_deepripe(
for ix, RBP_name in enumerate(RBPnames):
df_variants[RBP_name] = score_list[:, ix]
print(
f"saving file to: {output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv"
f"saving file to: {output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv.gz"
)
df_variants.to_csv(
f"{output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv", index=False
f"{output_dir}/{file_name[:-3]}{saved_model_type}_deepripe.csv.gz", index=False
)


Expand Down Expand Up @@ -1091,13 +1046,6 @@ def merge_annotations(vep_header_line:int,
vep_file,
header=vep_header_line,
sep="\t",
# dtype={
# "STRAND": str,
# "TSL": str,
# "GENE_PHENO": str,
# "CADD_PHRED": str,
# "CADD_RAW": str,
# },
)
vep_df = process_vep(vep_file=vep_df)
logger.info(f"vep_df shape is {vep_df.shape}")
Expand Down Expand Up @@ -1139,6 +1087,7 @@ def process_deepripe(deepripe_df:object, column_prefix:str)->object:
prefix_cols = [x for x in deepripe_df.columns if x not in key_cols]
new_names = [(i, i + f"_{column_prefix}") for i in prefix_cols]
deepripe_df = deepripe_df.rename(columns=dict(new_names))
deepripe_df.drop_duplicates(subset=["chrom", "pos", "ref", "alt"], inplace=True)
return deepripe_df

def process_vep(vep_file:object)->object:
Expand All @@ -1152,7 +1101,7 @@ def process_vep(vep_file:object)->object:
float_vals = ['DISTANCE', 'gnomADg_FIN_AF', 'AF', 'AFR_AF', 'AMR_AF','EAS_AF', 'EUR_AF', 'SAS_AF', 'MAX_AF','MOTIF_POS', 'MOTIF_SCORE_CHANGE', 'CADD_PHRED', 'CADD_RAW', 'PrimateAI', 'TSL', 'Condel']
vep_file[float_vals] = vep_file[float_vals].replace('-', 'NaN').astype(float)
dummies = vep_file["Consequence"].str.get_dummies(",").add_prefix("Consequence_")
hopefully_all_consequences= ['Consequence_splice_acceptor_variant','Consequence_5_prime_UTR_variant','Consequence_TFBS_ablation','Consequence_start_lost','Consequence_incomplete_terminal_codon_variant','Consequence_intron_variant', 'Consequence_stop_gained', 'Consequence_splice_donor_5th_base_variant', 'Consequence_downstream_gene_variant', 'Consequence_splice_donor_variant', 'Consequence_protein_altering_variant', 'Consequence_splice_polypyrimidine_tract_variant', 'Consequence_inframe_insertion', 'Consequence_mature_miRNA_variant', 'Consequence_synonymous_variant', 'Consequence_regulatory_region_variant', 'Consequence_non_coding_transcript_exon_variant', 'Consequence_stop_lost', 'Consequence_TF_binding_site_variant', 'Consequence_splice_donor_region_variant', 'Consequence_stop_retained_variant', 'Consequence_splice_region_variant', 'Consequence_coding_sequence_variant', 'Consequence_upstream_gene_variant', 'Consequence_frameshift_variant', 'Consequence_start_retained_variant', 'Consequence_3_prime_UTR_variant', 'Consequence_inframe_deletion', 'Consequence_missense_variant', 'Consequence_non_coding_transcript_variant']
hopefully_all_consequences= ['Consequence_splice_acceptor_variant','Consequence_5_prime_UTR_variant','Consequence_TFBS_ablation','Consequence_start_lost','Consequence_incomplete_terminal_codon_variant','Consequence_intron_variant', 'Consequence_stop_gained', 'Consequence_splice_donor_5th_base_variant', 'Consequence_downstream_gene_variant', 'Consequence_intergenic_variant', 'Consequence_splice_donor_variant','Consequence_NMD_transcript_variant', 'Consequence_protein_altering_variant', 'Consequence_splice_polypyrimidine_tract_variant', 'Consequence_inframe_insertion', 'Consequence_mature_miRNA_variant', 'Consequence_synonymous_variant', 'Consequence_regulatory_region_variant', 'Consequence_non_coding_transcript_exon_variant', 'Consequence_stop_lost', 'Consequence_TF_binding_site_variant', 'Consequence_splice_donor_region_variant', 'Consequence_stop_retained_variant', 'Consequence_splice_region_variant', 'Consequence_coding_sequence_variant', 'Consequence_upstream_gene_variant', 'Consequence_frameshift_variant', 'Consequence_start_retained_variant', 'Consequence_3_prime_UTR_variant', 'Consequence_inframe_deletion', 'Consequence_missense_variant', 'Consequence_non_coding_transcript_variant']
hopefully_all_consequences = list(set(hopefully_all_consequences))
mask = pd.DataFrame(data = np.zeros(shape= ( len(vep_file), len(hopefully_all_consequences))), columns=hopefully_all_consequences , dtype=float)
mask[list(dummies.columns)]=dummies
Expand Down
47 changes: 33 additions & 14 deletions pipelines/annotations.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ ncores_merge_absplice = int(config.get("n_cores_merge_absplice") or 64)
n_jobs_deepripe = int(config.get("n_jobs_deepripe") or 8)
# init kipoi-veff2
kipoi_repo_dir = Path(config["kipoiveff_repo_dir"])
ncores_addis = int(config.get("n_jobs_deepripe") or 32)
ncores_addis = int(config.get("n_jobs_addids") or 32)
# Filter out which chromosomes to work with
pvcf_blocks_df = pvcf_blocks_df[
pvcf_blocks_df["Chromosome"].isin([str(c) for c in included_chromosomes])
Expand All @@ -98,7 +98,7 @@ block = pvcf_blocks_df["Block"]

rule all:
input:
anno_dir / "current_annotations.parquet",
anno_dir / "current_annotations_absplice.parquet",


rule aggregate_and_merge_absplice:
Expand All @@ -109,11 +109,9 @@ rule aggregate_and_merge_absplice:
chr=chromosomes,
block=block,
),
current_annotation_file=anno_dir
/ "current_annotations_deepsea_deepripe_parclip_hg2_k5.parquet",
current_annotation_file=anno_dir / "vep_deepripe_deepsea.parquet"
output:
annotations=anno_dir
/ "current_annotations_absplice_deepsea_deepripe_parclip_hg2_k5.parquet",
annotations=anno_dir / "vcomplete_annotations.parquet.parquet"
scores=anno_tmp_dir / "abSplice_score_file.parquet",

shell:
Expand All @@ -129,6 +127,23 @@ rule aggregate_and_merge_absplice:
f"{ncores_merge_absplice}"
])

rule merge_deepsea_pcas:
input:
annotations=anno_dir / "vep_deepripe.parquet"
deepsea_pcas=anno_dir / "deepSea_pca" / "deepsea_pca.parquet",
output:
anno_dir / "vep_deepripe_deepsea.parquet"
shell:
" ".join(
[
"python",
f"{annotation_python_file}",
"merge-deepsea-pcas",
"{input.annotations}",
"{input.deepsea_pcas}",
"{output}",
]
)

rule concat_annotations:
input:
Expand All @@ -139,14 +154,18 @@ rule concat_annotations:
zip,
chr=chromosomes,
block=block)
output: anno_dir / "current_annotations.parquet"
shell: f"python " +
str(annotation_python_file) + " concat-annotations "+
" {input.pvcf} "+
" {input.anno_dir} "+
f"{str(vcf_pattern+'_merged.parquet').format(chr='{{chr}}', block='{{block}}')} "+
"{output}"+
f" --included-chromosomes {','.join(included_chromosomes)}"
output: anno_dir / "vep_deepripe.parquet"
shell:
" ".join([
"python",
str(annotation_python_file),
"concat-annotations",
"{input.pvcf}",
"{input.anno_dir}",
f"{str(vcf_pattern+'_merged.parquet').format(chr='{{chr}}', block='{{block}}')}",
"{output}",
f" --included-chromosomes {','.join(included_chromosomes)}"
])

rule merge_annotations:
input:
Expand Down
1 change: 1 addition & 0 deletions pipelines/config/deeprvat_annotation_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ pybedtools_tmp_path : output_dir/annotations/tmp/pybedtools
n_jobs_deepripe : 32
n_cores_merge_absplice : 32
n_cores_absplice : 32
deepsea_pca_pickle_filepath : annotations/deepSea_pca/pca.pkl

0 comments on commit d9b303b

Please sign in to comment.