diff --git a/.github/workflows/run_analysis.yml b/.github/workflows/run_analysis.yml index 7e222c9975..fe68b1b624 100644 --- a/.github/workflows/run_analysis.yml +++ b/.github/workflows/run_analysis.yml @@ -98,6 +98,7 @@ jobs: - name: Molecular Subtyping - PB entrypoint: molecular-subtyping-PB/run-molecular-subtyping-PB.sh + openpbta_subset: 0 - name: Molecular Subtyping - NBL entrypoint: molecular-subtyping-NBL/run-molecular-subtyping-NBL.sh diff --git a/analyses/molecular-subtyping-PB/01-molecular-subtype-pineoblastoma.html b/analyses/molecular-subtyping-PB/01-molecular-subtype-pineoblastoma.html index 4030b8f4e1..6495ad22bb 100644 --- a/analyses/molecular-subtyping-PB/01-molecular-subtype-pineoblastoma.html +++ b/analyses/molecular-subtyping-PB/01-molecular-subtype-pineoblastoma.html @@ -360,13 +360,13 @@
library(tidyverse)
-## ── Attaching core tidyverse packages ──────────────────────────────────── tidyverse 2.0.0 ──
+## ── Attaching core tidyverse packages ─────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
-## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
+## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
@@ -385,7 +385,7 @@ set directories
read files
histo <- readr::read_tsv(file.path(data_dir, "histologies-base.tsv"))
## Rows: 47895 Columns: 64
-## ── Column specification ────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (41): Kids_First_Participant_ID, Kids_First_Biospecimen_ID, sample_id, a...
## dbl (21): cell_line_passage, OS_days, EFS_days, age_at_diagnosis_days, age_a...
diff --git a/analyses/molecular-subtyping-PB/02-pineoblastoma-umap.Rmd b/analyses/molecular-subtyping-PB/02-pineoblastoma-umap.Rmd
new file mode 100644
index 0000000000..acb65a5a37
--- /dev/null
+++ b/analyses/molecular-subtyping-PB/02-pineoblastoma-umap.Rmd
@@ -0,0 +1,124 @@
+---
+title: "Pineoblastoma UMAP"
+author: "Jo Lynne Rokita"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+Load libraries and set directory paths
+
+```{r}
+suppressPackageStartupMessages({
+ library(tidyverse)
+ library(umap)
+ library(ggplot2)
+ library(devtools)
+ library(gdata)
+ library(ggpubr)
+ library(patchwork)
+})
+
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+data_dir <- file.path(root_dir, "data")
+analysis_dir <- file.path(root_dir, "analyses", "molecular-subtyping-PB")
+results_dir <- file.path(analysis_dir, "results")
+plots_dir <- file.path(analysis_dir, "plot")
+
+source(file.path(root_dir, "figures", "manuscript_OPC", "utils", "theme_for_plot.R"))
+```
+
+Set file paths
+
+```{r}
+hist_file <- file.path(data_dir, "histologies-base.tsv")
+subtype_file <- file.path(results_dir, "pineo-molecular-subtypes.tsv")
+methyl_file <- file.path(data_dir, "methyl-beta-values.rds")
+
+```
+
+Wrangle data.
+
+```{r get methyl ids}
+hist <- read_tsv(hist_file)
+```
+
+Filter hist for methyl samples, and append to subtype df
+
+```{r}
+hist_methyl <- hist %>%
+ dplyr::filter(pathology_diagnosis == "Pineoblastoma",
+ experimental_strategy == "Methylation")
+
+subtypes <- read_tsv(subtype_file) %>%
+ right_join(hist_methyl) %>%
+ filter(!is.na(molecular_subtype_methyl)) %>%
+ dplyr::rename(Kids_First_Biospecimen_ID_methyl = Kids_First_Biospecimen_ID) %>%
+ # remove anything not PB and also the to be classified sample
+ # update one with germline pathogenic variant (GPV) reported in Fiorca, et al 2024
+ mutate(molecular_subtype = case_when(Kids_First_Biospecimen_ID_methyl == "BS_SG2X2XQB" ~ paste0(molecular_subtype, " + DROSHA GPV"),
+ TRUE ~ molecular_subtype),
+ molecular_subtype_methyl = case_when(Kids_First_Biospecimen_ID_methyl == "BS_SG2X2XQB" ~ paste0(molecular_subtype_methyl, " + DROSHA GPV"),
+ TRUE ~ molecular_subtype_methyl))
+
+```
+
+Get number of samples by MB SHH subtype
+
+```{r}
+table(hist_methyl$molecular_subtype)
+```
+
+Load methylation data and filter for ids in `mb_shh_subtypes`
+
+```{r load methyl}
+methyl <- readRDS(methyl_file)
+
+pineo_methyl <- methyl[,colnames(methyl) %in% c("Probe_ID", subtypes$Kids_First_Biospecimen_ID_methyl)]
+
+pineo_methyl <- pineo_methyl %>%
+ distinct(Probe_ID, .keep_all = TRUE) %>%
+ column_to_rownames("Probe_ID")
+```
+
+Identify 20k most variable probes among MB samples
+
+```{r}
+methyl_var <- apply(pineo_methyl, 1, var, na.rm = TRUE)
+var_probes <- names(sort(methyl_var, decreasing = TRUE)[1:20000])
+```
+
+Generate UMAP results
+
+```{r}
+set.seed(2024)
+# neighbors needs to be low because the sample size is very low right now
+umap_results <- umap::umap(t(pineo_methyl[var_probes, ]), n_neighbors = 5)
+
+umap_plot_df <- data.frame(umap_results$layout) %>%
+ tibble::rownames_to_column("Kids_First_Biospecimen_ID_methyl") %>%
+ left_join(subtypes)
+```
+
+Plot UMAP with molecular subtype and age range
+
+```{r}
+umap_pineo <- ggplot(umap_plot_df, aes(x = X1,
+ y = X2,
+ fill = molecular_subtype_methyl)) +
+ geom_point(alpha = 0.8, size = 3.5, shape = 21, stroke = 0.8, color = "black") +
+ labs(fill = "Molecular subtype") +
+ theme_bw() +
+ xlab("UMAP1") +
+ ylab("UMAP2") +
+ theme_Publication()
+
+ggsave(file.path(plots_dir, "umap_pineo.pdf"),
+ umap_pineo,
+ width = 7, height = 3.5)
+```
+
+Save session info
+```{r}
+sessionInfo()
+```
\ No newline at end of file
diff --git a/analyses/molecular-subtyping-PB/README.md b/analyses/molecular-subtyping-PB/README.md
index f4f1ea176a..75e8b9bc93 100644
--- a/analyses/molecular-subtyping-PB/README.md
+++ b/analyses/molecular-subtyping-PB/README.md
@@ -20,6 +20,8 @@ bash run-molecular-subtyping-PB.sh
This folder contains scripts tasked to molecularly subtype PB samples
+`00-PB-select-pathology-dx.R` - filter to pineoblastoma diagnoses
+`01-molecular-subtype-pineoblastoma.Rmd` - Subtype pineoblastoma:
* Filter the samples with `dkfz_v12_methylation_subclass >=0.8` and `cns_methylation_subclass` is one of the three PB subtypes ->
* PB_FOXR2: pineoblastoma, MYC/FOXR2-activated (PB, MYC/FOXR2)
* PB_GRP1A, PB_GRP1B: pineoblastoma, miRNA processing-altered 1 (PB, Group 1)
@@ -28,4 +30,7 @@ This folder contains scripts tasked to molecularly subtype PB samples
* If methylation does not exist or `dkfz_v12_methylation_subclass < 0.8` for any PB samples -> `PB, To be clasified`
-Final results is a table with `Kids_First_Biospecimen_ID`, `Kids_First_Participant_ID`, `sample_id`, `molecular_subtype_methyl`, and `molecular_subtype`, and saved as `pb-molecular-subtype.tsv`
\ No newline at end of file
+Final results is a table with `Kids_First_Biospecimen_ID`, `Kids_First_Participant_ID`, `sample_id`, `molecular_subtype_methyl`, and `molecular_subtype`, and saved as `pb-molecular-subtype.tsv`.
+
+`02-pineoblastoma-umap.Rmd` - Create umap of top 20k most variable methylation probes and plot the pineoblastoma subtypes.
+
diff --git a/analyses/molecular-subtyping-PB/run-molecular-subtyping-PB.sh b/analyses/molecular-subtyping-PB/run-molecular-subtyping-PB.sh
index e3ba0fdf63..dbcce71ba6 100755
--- a/analyses/molecular-subtyping-PB/run-molecular-subtyping-PB.sh
+++ b/analyses/molecular-subtyping-PB/run-molecular-subtyping-PB.sh
@@ -3,11 +3,49 @@
set -e
set -o pipefail
-# set up running directory
-cd "$(dirname "${BASH_SOURCE[0]}")"
+# This script should always run as if it were being called from
+# the directory it lives in.
+script_directory="$(perl -e 'use File::Basename;
+ use Cwd "abs_path";
+ print dirname(abs_path(@ARGV[0]));' -- "$0")"
+cd "$script_directory" || exit
+
+# This will be turned off in CI
+SUBSET=${OPENPBTA_SUBSET:-1}
+
+scratch_path="../../scratch/"
+data_dir="../../data"
+
+
# Run R script to generate JSON file
Rscript --vanilla 00-PB-select-pathology-dx.R
# Run R script to subtype PB using methylation data
Rscript -e "rmarkdown::render('01-molecular-subtype-pineoblastoma.Rmd', clean = TRUE)"
+
+
+if [ "$SUBSET" -gt "0" ]; then
+ echo "check whether methylation files exist"
+ URL="https://d3b-openaccess-us-east-1-prd-pbta.s3.amazonaws.com/open-targets"
+ RELEASE="v15"
+ BETA="methyl-beta-values.rds"
+
+ if [ -f "${data_dir}/${BETA}" ]; then
+ echo "${BETA} exists, skip downloading"
+ echo "run pineoblastoma clustering"
+ # add umap
+ Rscript -e "rmarkdown::render('02-pineoblastoma-umap.Rmd', clean = TRUE)"
+ else
+ echo "${BETA} does not exist, downloading..."
+ wget ${URL}/${RELEASE}/${BETA} -P ${data_dir}/${RELEASE}/
+ cd ${data_dir}
+ ln -sfn ${RELEASE}/${BETA} ./${BETA}
+ cd ../analyses/molecular-subtyping-PB
+ echo "run pineoblastoma clustering"
+ # add umap
+ Rscript -e "rmarkdown::render('02-pineoblastoma-umap.Rmd', clean = TRUE)"
+ fi
+else
+ echo "SUBSET is not greater than 0 or not a valid number."
+fi