add pineo umap analysis

rokitalab · Nov 11, 2024 · 7e1521d · 7e1521d
1 parent ce24b0d
commit 7e1521d
Show file tree

Hide file tree

Showing 5 changed files with 174 additions and 6 deletions.
diff --git a/.github/workflows/run_analysis.yml b/.github/workflows/run_analysis.yml
@@ -98,6 +98,7 @@ jobs:
 
           - name: Molecular Subtyping - PB
             entrypoint: molecular-subtyping-PB/run-molecular-subtyping-PB.sh
+            openpbta_subset: 0
 
           - name: Molecular Subtyping - NBL
             entrypoint: molecular-subtyping-NBL/run-molecular-subtyping-NBL.sh

diff --git a/analyses/molecular-subtyping-PB/01-molecular-subtype-pineoblastoma.html b/analyses/molecular-subtyping-PB/01-molecular-subtype-pineoblastoma.html
@@ -360,13 +360,13 @@ <h4 class="date">2023-12-06</h4>
 <div id="load-library" class="section level2">
 <h2>load library</h2>
 <pre class="r"><code>library(tidyverse)</code></pre>
-<pre><code>## ── Attaching core tidyverse packages ──────────────────────────────────── tidyverse 2.0.0 ──
+<pre><code>## ── Attaching core tidyverse packages ─────────────────────────────────────────────── tidyverse 2.0.0 ──
 ## ✔ dplyr     1.1.4     ✔ readr     2.1.5
 ## ✔ forcats   1.0.0     ✔ stringr   1.5.1
 ## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
 ## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
 ## ✔ purrr     1.0.2     
-## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
+## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 ## ✖ dplyr::filter() masks stats::filter()
 ## ✖ dplyr::lag()    masks stats::lag()
 ## ℹ Use the conflicted package (&lt;http://conflicted.r-lib.org/&gt;) to force all conflicts to become errors</code></pre>
@@ -385,7 +385,7 @@ <h2>set directories</h2>
 <h2>read files</h2>
 <pre class="r"><code>histo &lt;- readr::read_tsv(file.path(data_dir, &quot;histologies-base.tsv&quot;))</code></pre>
 <pre><code>## Rows: 47895 Columns: 64
-## ── Column specification ────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
 ## Delimiter: &quot;\t&quot;
 ## chr (41): Kids_First_Participant_ID, Kids_First_Biospecimen_ID, sample_id, a...
 ## dbl (21): cell_line_passage, OS_days, EFS_days, age_at_diagnosis_days, age_a...

diff --git a/analyses/molecular-subtyping-PB/02-pineoblastoma-umap.Rmd b/analyses/molecular-subtyping-PB/02-pineoblastoma-umap.Rmd
@@ -0,0 +1,124 @@
+---
+title: "Pineoblastoma UMAP"
+author: "Jo Lynne Rokita"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+Load libraries and set directory paths
+
+```{r}
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(umap)
+  library(ggplot2)
+  library(devtools)
+  library(gdata)
+  library(ggpubr)
+  library(patchwork)
+})
+
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+data_dir <- file.path(root_dir, "data")
+analysis_dir <- file.path(root_dir, "analyses", "molecular-subtyping-PB")
+results_dir <- file.path(analysis_dir, "results")
+plots_dir <- file.path(analysis_dir, "plot")
+
+source(file.path(root_dir, "figures", "manuscript_OPC", "utils", "theme_for_plot.R"))
+```
+
+Set file paths
+
+```{r}
+hist_file <- file.path(data_dir, "histologies-base.tsv")
+subtype_file <- file.path(results_dir, "pineo-molecular-subtypes.tsv")
+methyl_file <- file.path(data_dir, "methyl-beta-values.rds")
+
+```
+
+Wrangle data. 
+
+```{r get methyl ids}
+hist <- read_tsv(hist_file)
+```
+
+Filter hist for methyl samples, and append to subtype df
+
+```{r}
+hist_methyl <- hist %>%
+  dplyr::filter(pathology_diagnosis == "Pineoblastoma",
+                experimental_strategy == "Methylation") 
+
+subtypes <- read_tsv(subtype_file) %>%
+  right_join(hist_methyl) %>%
+  filter(!is.na(molecular_subtype_methyl)) %>%
+  dplyr::rename(Kids_First_Biospecimen_ID_methyl = Kids_First_Biospecimen_ID) %>%
+  # remove anything not PB and also the to be classified sample
+  # update one with germline pathogenic variant (GPV) reported in Fiorca, et al 2024
+  mutate(molecular_subtype = case_when(Kids_First_Biospecimen_ID_methyl == "BS_SG2X2XQB" ~ paste0(molecular_subtype, " + DROSHA GPV"),
+         TRUE ~ molecular_subtype),
+         molecular_subtype_methyl = case_when(Kids_First_Biospecimen_ID_methyl == "BS_SG2X2XQB" ~ paste0(molecular_subtype_methyl, " + DROSHA GPV"),
+         TRUE ~ molecular_subtype_methyl))
+
+```
+
+Get number of samples by MB SHH subtype
+
+```{r}
+table(hist_methyl$molecular_subtype)
+```
+
+Load methylation data and filter for ids in `mb_shh_subtypes`
+
+```{r load methyl}
+methyl <- readRDS(methyl_file)
+
+pineo_methyl <- methyl[,colnames(methyl) %in% c("Probe_ID", subtypes$Kids_First_Biospecimen_ID_methyl)]
+
+pineo_methyl <- pineo_methyl %>%
+  distinct(Probe_ID, .keep_all = TRUE) %>%
+  column_to_rownames("Probe_ID")
+```
+
+Identify 20k most variable probes among MB samples
+
+```{r}
+methyl_var <- apply(pineo_methyl, 1, var, na.rm = TRUE)
+var_probes <- names(sort(methyl_var, decreasing = TRUE)[1:20000])
+```
+
+Generate UMAP results
+
+```{r}
+set.seed(2024)
+# neighbors needs to be low because the sample size is very low right now
+umap_results <- umap::umap(t(pineo_methyl[var_probes, ]), n_neighbors = 5)
+
+umap_plot_df <- data.frame(umap_results$layout) %>%
+  tibble::rownames_to_column("Kids_First_Biospecimen_ID_methyl") %>%
+  left_join(subtypes)
+```
+
+Plot UMAP with molecular subtype and age range 
+
+```{r}
+umap_pineo <-  ggplot(umap_plot_df, aes(x = X1, 
+             y = X2,
+             fill = molecular_subtype_methyl)) +
+  geom_point(alpha = 0.8, size = 3.5, shape = 21, stroke = 0.8, color = "black") +
+  labs(fill = "Molecular subtype") +
+  theme_bw() +
+  xlab("UMAP1") +
+  ylab("UMAP2") +
+  theme_Publication() 
+
+ggsave(file.path(plots_dir, "umap_pineo.pdf"),
+       umap_pineo,
+       width = 7, height = 3.5)
+```
+
+Save session info
+```{r}
+sessionInfo()
+```
diff --git a/analyses/molecular-subtyping-PB/README.md b/analyses/molecular-subtyping-PB/README.md
@@ -20,6 +20,8 @@ bash run-molecular-subtyping-PB.sh
 
 This folder contains scripts tasked to molecularly subtype PB samples 
 
+`00-PB-select-pathology-dx.R` - filter to pineoblastoma diagnoses
+`01-molecular-subtype-pineoblastoma.Rmd` - Subtype pineoblastoma:
 * Filter the samples with `dkfz_v12_methylation_subclass >=0.8` and `cns_methylation_subclass` is one of the three PB subtypes ->
   * PB_FOXR2: pineoblastoma, MYC/FOXR2-activated (PB, MYC/FOXR2)
   * PB_GRP1A, PB_GRP1B: pineoblastoma, miRNA processing-altered 1 (PB, Group 1)
@@ -28,4 +30,7 @@ This folder contains scripts tasked to molecularly subtype PB samples
 
 * If methylation does not exist or `dkfz_v12_methylation_subclass < 0.8`  for any PB samples -> `PB, To be clasified`
 
-Final results is a table with `Kids_First_Biospecimen_ID`, `Kids_First_Participant_ID`, `sample_id`, `molecular_subtype_methyl`, and `molecular_subtype`, and saved as `pb-molecular-subtype.tsv`
+Final results is a table with `Kids_First_Biospecimen_ID`, `Kids_First_Participant_ID`, `sample_id`, `molecular_subtype_methyl`, and `molecular_subtype`, and saved as `pb-molecular-subtype.tsv`.
+
+`02-pineoblastoma-umap.Rmd` - Create umap of top 20k most variable methylation probes and plot the pineoblastoma subtypes.
+
diff --git a/analyses/molecular-subtyping-PB/run-molecular-subtyping-PB.sh b/analyses/molecular-subtyping-PB/run-molecular-subtyping-PB.sh
@@ -3,11 +3,49 @@
 set -e
 set -o pipefail
 
-# set up running directory
-cd "$(dirname "${BASH_SOURCE[0]}")" 
+# This script should always run as if it were being called from
+# the directory it lives in.
+script_directory="$(perl -e 'use File::Basename;
+  use Cwd "abs_path";
+  print dirname(abs_path(@ARGV[0]));' -- "$0")"
+cd "$script_directory" || exit
+
+# This will be turned off in CI
+SUBSET=${OPENPBTA_SUBSET:-1}
+
+scratch_path="../../scratch/"
+data_dir="../../data"
+
+
 
 # Run R script to generate JSON file
 Rscript --vanilla 00-PB-select-pathology-dx.R
 
 # Run R script to subtype PB using methylation data 
 Rscript -e "rmarkdown::render('01-molecular-subtype-pineoblastoma.Rmd', clean = TRUE)"
+
+
+if [ "$SUBSET" -gt "0" ]; then
+  echo "check whether methylation files exist"
+  URL="https://d3b-openaccess-us-east-1-prd-pbta.s3.amazonaws.com/open-targets"
+  RELEASE="v15"
+  BETA="methyl-beta-values.rds"
+
+  if [ -f "${data_dir}/${BETA}" ]; then
+      echo "${BETA} exists, skip downloading"
+      echo "run pineoblastoma clustering"
+      # add umap
+      Rscript -e "rmarkdown::render('02-pineoblastoma-umap.Rmd', clean = TRUE)"
+    else 
+      echo "${BETA} does not exist, downloading..."
+      wget ${URL}/${RELEASE}/${BETA} -P ${data_dir}/${RELEASE}/
+      cd ${data_dir}
+      ln -sfn ${RELEASE}/${BETA} ./${BETA}
+      cd ../analyses/molecular-subtyping-PB
+      echo "run pineoblastoma clustering"
+      # add umap
+      Rscript -e "rmarkdown::render('02-pineoblastoma-umap.Rmd', clean = TRUE)"
+  fi
+else
+  echo "SUBSET is not greater than 0 or not a valid number."
+fi