Small changes

PMBio · Sep 28, 2023 · a85e404 · a85e404
1 parent 530da1f
commit a85e404
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 9 deletions.
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
@@ -466,7 +466,7 @@ def deepripe_score_variant_onlyseq_all(
             bedline, genomefasta, flank_size=(seq_len // 2) + 2
         ) for bedline in variant_bed)
     encoded_seqs_list = [(x if x is not None
-                          else np.ones(encoded_seqs_list[0].shape) * float("nan"))
+                          else np.ones((2, seq_len + 4, 4)) * float("nan"))
                          for x in encoded_seqs_list]
     encoded_seqs = tf.concat(encoded_seqs_list, 0)
 
@@ -895,9 +895,11 @@ def merge_deepripe(
     key_cols = ["chrom", "pos", "ref", "alt", "id"]
     prefix_cols = [x for x in deepripe_df.columns if x not in key_cols]
     new_names = [(i, i + f"_{column_prefix}") for i in prefix_cols]
-    deepripe_df.rename(columns=dict(new_names))
+
+    deepripe_df = deepripe_df.rename(columns=dict(new_names))
+    logger.info(deepripe_df.columns)
     merged = annotations.merge(
-        deepripe_df, how="left", on=["chrom", "pos", "ref", "alt"]
+        deepripe_df, how="left", on=["chrom", "pos", "ref", "alt", "id"]
     )
     assert len(merged) == orig_len
     merged.to_parquet(out_file)
@@ -958,15 +960,29 @@ def add_ids(annotation_file: str, variant_file: str, out_file: str):
 
     logger.info("Sanity check of  file IDs")
     all_variants = pd.read_csv(variant_file, sep="\t")
+    df.drop_duplicates(subset=key_cols, inplace = True)
     df_shape = df.shape
     df = pd.merge(all_variants, df, on=key_cols, how="left")
     # sanity checks
     # same length as variant file
-    assert df.shape[0] == all_variants.shape[0]
+    try: 
+        assert df.shape[0] == all_variants.shape[0]
+    except AssertionError: 
+        logger.error(f"df.shape[0] was {df.shape[0]} but all_variants.shape[0] was {all_variants.shape[0]}")
+        raise AssertionError
     # one more column as annotation file
-    assert df.shape[1] == df_shape[1] + 1
+    try:
+        assert df.shape[1] == df_shape[1] + 1
+    except AssertionError:
+        logger.error(f"df.shape[1] was {df.shape[1]} but df_shape[1] + 1 was {df_shape[1] + 1}")
+        raise AssertionError       
     # all key columns and id are present
-    assert all([x in df.columns for x in key_cols + ["id"]])
+    try:
+        assert all([x in df.columns for x in key_cols + ["id"]])
+    except AssertionError:    
+        logger.error("not all key cols in df.columns")
+        logger.info(df.columns)
+        raise AssertionError          
     df.to_csv(out_file, index=False)
 
 
@@ -1183,7 +1199,7 @@ def concatenate_annotations(
     vep_file = vep_file[vep_file.BIOTYPE == "protein_coding"]
     logger.info("splitting variant name")
     vep_file[["chrom", "pos", "ref", "alt"]] = vep_file["Uploaded_variation"].str.split(
-        "_", expand=True
+        ":", expand=True
     )
     vep_file["pos"] = vep_file["pos"].astype(int)
 

diff --git a/deeprvat_annotations.yml b/deeprvat_annotations.yml
@@ -1,12 +1,12 @@
-name: deeprvat_annotations
+name: deeprvat_annotations_2
 channels:
   - conda-forge
   - defaults
   - bioconda
 dependencies:
   - python=3.9.16
   - click=8.0.4
-  - scikit-learn=0.23.2
+  - scikit-learn=1.0.2
   - dask=2021.5.0
   - tqdm=4.59.0
   - pybedtools=0.9.0

diff --git a/pipelines/resources/absplice_config.yaml b/pipelines/resources/absplice_config.yaml
@@ -0,0 +1,92 @@
+# ============================= AbSplice-DNA related ================================
+
+# specify the genome version: hg19 or hg38
+genome: hg38
+
+# store all the vcf files that you want to analyze in the directory ../data/resources/analysis_files/vcf_files/
+vcf: ../data/resources/analysis_files/vcf_files/{vcf_id}
+
+# parameters for variant filtering
+variant_filtering:
+  minor_allele_frequency: 
+    apply_filter: False
+    filter_cutoff: 0.001
+  max_num_samples: 
+    apply_filter: False
+    filter_cutoff: 2
+
+# False by default; set to True here, which is the setting to use when running on large datasets (see Readme)
+use_rocksdb: True
+
+splicemap_tissues:
+  - Adipose_Subcutaneous
+  - Adipose_Visceral_Omentum
+  - Adrenal_Gland
+  - Artery_Aorta
+  - Artery_Coronary
+  - Artery_Tibial
+  - Brain_Amygdala
+  - Brain_Anterior_cingulate_cortex_BA24
+  - Brain_Caudate_basal_ganglia
+  - Brain_Cerebellar_Hemisphere
+  - Brain_Cerebellum
+  - Brain_Cortex
+  - Brain_Frontal_Cortex_BA9
+  - Brain_Hippocampus
+  - Brain_Hypothalamus
+  - Brain_Nucleus_accumbens_basal_ganglia
+  - Brain_Putamen_basal_ganglia
+  - Brain_Spinal_cord_cervical_c_1
+  - Brain_Substantia_nigra
+  - Breast_Mammary_Tissue
+  - Cells_Cultured_fibroblasts
+  - Cells_EBV_transformed_lymphocytes
+  - Colon_Sigmoid
+  - Colon_Transverse
+  - Esophagus_Gastroesophageal_Junction
+  - Esophagus_Mucosa
+  - Esophagus_Muscularis
+  - Heart_Atrial_Appendage
+  - Heart_Left_Ventricle
+  - Kidney_Cortex
+  - Liver
+  - Lung
+  - Minor_Salivary_Gland
+  - Muscle_Skeletal
+  - Nerve_Tibial
+  - Ovary
+  - Pancreas
+  - Pituitary
+  - Prostate
+  - Skin_Not_Sun_Exposed_Suprapubic
+  - Skin_Sun_Exposed_Lower_leg
+  - Small_Intestine_Terminal_Ileum
+  - Spleen
+  - Stomach
+  - Testis
+  - Thyroid
+  - Uterus
+  - Vagina
+  - Whole_Blood
+
+
+# ============================= AbSplice-RNA related ================================
+
+AbSplice_RNA: False
+
+DROP:
+    geneAnnotation: 'v29'
+    DROP_group: 
+      - 'Cells_Cultured_fibroblasts'
+    working_dir: ../data/resources/analysis_files/absplice_rna_related_files/DROP/
+    sample_annotation: processed_data/aberrant_splicing/annotations/{tissue_cat}.tsv
+    count_table: processed_data/aberrant_splicing/datasets/
+    outliers: processed_results/aberrant_splicing/results/{geneAnnotation}/
+
+
+cat_outlier_filtering:
+  padjustGene_cutoff: 0.1
+  padjust_junction_cutoff: 0.05
+  totalCounts_cutoff: 20
+  delta_psi_cutoff: 0.3
+  outlier_type: 'psi5__psi3__theta'