Skip to content

Commit

Permalink
Small changes
Browse files Browse the repository at this point in the history
  • Loading branch information
“Marcel-Mueck” committed Sep 28, 2023
1 parent 530da1f commit a85e404
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 9 deletions.
30 changes: 23 additions & 7 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ def deepripe_score_variant_onlyseq_all(
bedline, genomefasta, flank_size=(seq_len // 2) + 2
) for bedline in variant_bed)
encoded_seqs_list = [(x if x is not None
else np.ones(encoded_seqs_list[0].shape) * float("nan"))
else np.ones((2, seq_len + 4, 4)) * float("nan"))
for x in encoded_seqs_list]
encoded_seqs = tf.concat(encoded_seqs_list, 0)

Expand Down Expand Up @@ -895,9 +895,11 @@ def merge_deepripe(
key_cols = ["chrom", "pos", "ref", "alt", "id"]
prefix_cols = [x for x in deepripe_df.columns if x not in key_cols]
new_names = [(i, i + f"_{column_prefix}") for i in prefix_cols]
deepripe_df.rename(columns=dict(new_names))

deepripe_df = deepripe_df.rename(columns=dict(new_names))
logger.info(deepripe_df.columns)
merged = annotations.merge(
deepripe_df, how="left", on=["chrom", "pos", "ref", "alt"]
deepripe_df, how="left", on=["chrom", "pos", "ref", "alt", "id"]
)
assert len(merged) == orig_len
merged.to_parquet(out_file)
Expand Down Expand Up @@ -958,15 +960,29 @@ def add_ids(annotation_file: str, variant_file: str, out_file: str):

logger.info("Sanity check of file IDs")
all_variants = pd.read_csv(variant_file, sep="\t")
df.drop_duplicates(subset=key_cols, inplace = True)
df_shape = df.shape
df = pd.merge(all_variants, df, on=key_cols, how="left")
# sanity checks
# same length as variant file
assert df.shape[0] == all_variants.shape[0]
try:
assert df.shape[0] == all_variants.shape[0]
except AssertionError:
logger.error(f"df.shape[0] was {df.shape[0]} but all_variants.shape[0] was {all_variants.shape[0]}")
raise AssertionError
# one more column as annotation file
assert df.shape[1] == df_shape[1] + 1
try:
assert df.shape[1] == df_shape[1] + 1
except AssertionError:
logger.error(f"df.shape[1] was {df.shape[1]} but df_shape[1] + 1 was {df_shape[1] + 1}")
raise AssertionError
# all key columns and id are present
assert all([x in df.columns for x in key_cols + ["id"]])
try:
assert all([x in df.columns for x in key_cols + ["id"]])
except AssertionError:
logger.error("not all key cols in df.columns")
logger.info(df.columns)
raise AssertionError
df.to_csv(out_file, index=False)


Expand Down Expand Up @@ -1183,7 +1199,7 @@ def concatenate_annotations(
vep_file = vep_file[vep_file.BIOTYPE == "protein_coding"]
logger.info("splitting variant name")
vep_file[["chrom", "pos", "ref", "alt"]] = vep_file["Uploaded_variation"].str.split(
"_", expand=True
":", expand=True
)
vep_file["pos"] = vep_file["pos"].astype(int)

Expand Down
4 changes: 2 additions & 2 deletions deeprvat_annotations.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
name: deeprvat_annotations
name: deeprvat_annotations_2
channels:
- conda-forge
- defaults
- bioconda
dependencies:
- python=3.9.16
- click=8.0.4
- scikit-learn=0.23.2
- scikit-learn=1.0.2
- dask=2021.5.0
- tqdm=4.59.0
- pybedtools=0.9.0
Expand Down
92 changes: 92 additions & 0 deletions pipelines/resources/absplice_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# ============================= AbSplice-DNA related ================================

# specify the genome version: hg19 or hg38
genome: hg38

# store all the VCF files that you want to analyze in the directory ../data/resources/analysis_files/vcf_files/
vcf: ../data/resources/analysis_files/vcf_files/{vcf_id}

# parameters for variant filtering
variant_filtering:
minor_allele_frequency:
apply_filter: False
filter_cutoff: 0.001
max_num_samples:
apply_filter: False
filter_cutoff: 2

# False by default; set this to True when running on large datasets (see Readme)
use_rocksdb: True

splicemap_tissues:
- Adipose_Subcutaneous
- Adipose_Visceral_Omentum
- Adrenal_Gland
- Artery_Aorta
- Artery_Coronary
- Artery_Tibial
- Brain_Amygdala
- Brain_Anterior_cingulate_cortex_BA24
- Brain_Caudate_basal_ganglia
- Brain_Cerebellar_Hemisphere
- Brain_Cerebellum
- Brain_Cortex
- Brain_Frontal_Cortex_BA9
- Brain_Hippocampus
- Brain_Hypothalamus
- Brain_Nucleus_accumbens_basal_ganglia
- Brain_Putamen_basal_ganglia
- Brain_Spinal_cord_cervical_c_1
- Brain_Substantia_nigra
- Breast_Mammary_Tissue
- Cells_Cultured_fibroblasts
- Cells_EBV_transformed_lymphocytes
- Colon_Sigmoid
- Colon_Transverse
- Esophagus_Gastroesophageal_Junction
- Esophagus_Mucosa
- Esophagus_Muscularis
- Heart_Atrial_Appendage
- Heart_Left_Ventricle
- Kidney_Cortex
- Liver
- Lung
- Minor_Salivary_Gland
- Muscle_Skeletal
- Nerve_Tibial
- Ovary
- Pancreas
- Pituitary
- Prostate
- Skin_Not_Sun_Exposed_Suprapubic
- Skin_Sun_Exposed_Lower_leg
- Small_Intestine_Terminal_Ileum
- Spleen
- Stomach
- Testis
- Thyroid
- Uterus
- Vagina
- Whole_Blood


# ============================= AbSplice-RNA related ================================

AbSplice_RNA: False

DROP:
geneAnnotation: 'v29'
DROP_group:
- 'Cells_Cultured_fibroblasts'
working_dir: ../data/resources/analysis_files/absplice_rna_related_files/DROP/
sample_annotation: processed_data/aberrant_splicing/annotations/{tissue_cat}.tsv
count_table: processed_data/aberrant_splicing/datasets/
outliers: processed_results/aberrant_splicing/results/{geneAnnotation}/


cat_outlier_filtering:
padjustGene_cutoff: 0.1
padjust_junction_cutoff: 0.05
totalCounts_cutoff: 20
delta_psi_cutoff: 0.3
outlier_type: 'psi5__psi3__theta'

0 comments on commit a85e404

Please sign in to comment.