8.1.1. Survival Analysis - TCGA - Necroptosis - Unique Genes - Tumor.Rmd

---
title: "Survival Analysis"
subtitle: "Colorectal Cancer | Necroptosis | Unique Genes per RCD Type | Gene Expression of Tumor Samples"
author: 
  - Mark Edward M. Gonzales^[De La Salle University, Manila, Philippines, gonzales.markedward@gmail.com]
  - Dr. Anish M.S. Shrestha^[De La Salle University, Manila, Philippines, anish.shrestha@dlsu.edu.ph]
output: html_notebook
---

## I. Preliminaries

### Loading libraries

```{r, warning=FALSE, message=FALSE}
library("tidyverse")
library("tibble")
library("msigdbr")
library("ggplot2")
library("TCGAbiolinks")
library("RNAseqQC")
library("DESeq2")
library("ensembldb")
library("purrr")
library("magrittr")
library("vsn")
library("matrixStats")
library("dplyr")
library("grex")
library("survminer")
library("survival")
```

## II. Downloading the TCGA gene expression data 

Create a function for downloading TCGA gene expression data. 

For more detailed documentation, refer to `2. Differential Gene Expression Analysis - TCGA.Rmd`.

```{r}
query_and_filter_samples <- function(project) {
  query_tumor <- GDCquery(
    project = project,
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    experimental.strategy = "RNA-Seq",
    workflow.type = "STAR - Counts",
    access = "open",
    sample.type = "Primary Tumor"
  )
  tumor <- getResults(query_tumor)

  query_normal <- GDCquery(
    project = project,
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    experimental.strategy = "RNA-Seq",
    workflow.type = "STAR - Counts",
    access = "open",
    sample.type = "Solid Tissue Normal"
  )
  normal <- getResults(query_normal)

  submitter_ids <- inner_join(tumor, normal, by = "cases.submitter_id") %>%
    dplyr::select(cases.submitter_id)
  tumor <- tumor %>%
    dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)
  normal <- normal %>%
    dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)

  samples <- rbind(tumor, normal)
  unique(samples$sample_type)

  query_project <- GDCquery(
    project = project,
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    experimental.strategy = "RNA-Seq",
    workflow.type = "STAR - Counts",
    access = "open",
    sample.type = c("Solid Tissue Normal", "Primary Tumor"),
    barcode = as.list(samples$sample.submitter_id)
  )

  # If this is your first time running this notebook (i.e., you have not yet downloaded the results of the query in the previous block),
  # uncomment the line below

  # GDCdownload(query_project)

  return(list(samples = samples, query_project = query_project))
}
```

Download the TCGA gene expression data for colorectal cancer (TCGA-COAD).

```{r, echo = TRUE, message = FALSE, results="hide"}
projects <- c("TCGA-COAD")

with_results_projects <- c()

samples <- list()
project_data <- list()

for (project in projects) {
  result <- tryCatch(
    {
      result <- query_and_filter_samples(project)
      samples[[project]] <- result$samples
      project_data[[project]] <- result$query_project

      with_results_projects <- c(with_results_projects, project)
    },
    error = function(e) {

    }
  )
}
```

Running the code block above should generate and populate a directory named `GDCdata`.

## III. Data preprocessing

Construct the RNA-seq count matrix for each cancer type.

```{r, echo = TRUE, message = FALSE, results="hide"}
tcga_data <- list()
tcga_matrix <- list()

projects <- with_results_projects
for (project in projects) {
  tcga_data[[project]] <- GDCprepare(project_data[[project]], summarizedExperiment = TRUE)
}
```

```{r}
for (project in projects) {
  count_matrix <- assay(tcga_data[[project]], "unstranded")

  # Remove duplicate entries
  count_matrix_df <- data.frame(count_matrix)
  count_matrix_df <- count_matrix_df[!duplicated(count_matrix_df), ]
  count_matrix <- data.matrix(count_matrix_df)
  rownames(count_matrix) <- cleanid(rownames(count_matrix))
  count_matrix <- count_matrix[!(duplicated(rownames(count_matrix)) | duplicated(rownames(count_matrix), fromLast = TRUE)), ]

  tcga_matrix[[project]] <- count_matrix
}
```
Format the `samples` table so that it can be fed as input to DESeq2.

```{r}
for (project in projects) {
  rownames(samples[[project]]) <- samples[[project]]$cases
  samples[[project]] <- samples[[project]] %>%
    dplyr::select(case = "cases.submitter_id", type = "sample_type")
  samples[[project]]$type <- str_replace(samples[[project]]$type, "Solid Tissue Normal", "normal")
  samples[[project]]$type <- str_replace(samples[[project]]$type, "Primary Tumor", "tumor")
}
```

DESeq2 requires the row names of `samples` should be identical to the column names of `count_matrix`.

```{r, echo = TRUE, results="hide"}
for (project in projects) {
  colnames(tcga_matrix[[project]]) <- gsub(x = colnames(tcga_matrix[[project]]), pattern = "\\.", replacement = "-")
  tcga_matrix[[project]] <- tcga_matrix[[project]][, rownames(samples[[project]])]

  # Sanity check
  print(all(colnames(tcga_matrix[[project]]) == rownames(samples[[project]])))
}
```

## IV. Differential gene expression analysis

For more detailed documentation on obtaining the gene set, refer to `7. Differential Gene Expression Analysis - TCGA - Pan-cancer - Unique Genes.Rmd`.

```{r}
RCDdb <- "temp/unique_genes/necroptosis_ferroptosis_pyroptosis/"
```

Write utility functions for filtering the gene sets, performing differential gene expression analysis, plotting the results, and performing variance-stabilizing transformation.

```{r}
filter_gene_set_and_perform_dgea <- function(genes) {
  tcga_rcd <- list()

  for (project in projects) {
    rownames(genes) <- genes$gene_id
    tcga_rcd[[project]] <- tcga_matrix[[project]][rownames(tcga_matrix[[project]]) %in% genes$gene_id, ]
    tcga_rcd[[project]] <- tcga_rcd[[project]][, rownames(samples[[project]])]
  }

  dds_rcd <- list()
  res_rcd <- list()

  for (project in projects) {
    print(project)
    print("=============")
    dds <- DESeqDataSetFromMatrix(
      countData = tcga_rcd[[project]],
      colData = samples[[project]],
      design = ~type
    )
    dds <- filter_genes(dds, min_count = 10)
    dds$type <- relevel(dds$type, ref = "normal")
    dds_rcd[[project]] <- DESeq(dds)
    res_rcd[[project]] <- results(dds_rcd[[project]])
  }

  deseq.bbl.data <- list()

  for (project in projects) {
    deseq.results <- res_rcd[[project]]
    deseq.bbl.data[[project]] <- data.frame(
      row.names = rownames(deseq.results),
      baseMean = deseq.results$baseMean,
      log2FoldChange = deseq.results$log2FoldChange,
      lfcSE = deseq.results$lfcSE,
      stat = deseq.results$stat,
      pvalue = deseq.results$pvalue,
      padj = deseq.results$padj,
      cancer_type = project,
      gene_symbol = genes[rownames(deseq.results), "gene"]
    )
  }

  deseq.bbl.data.combined <- bind_rows(deseq.bbl.data)
  deseq.bbl.data.combined <- dplyr::filter(deseq.bbl.data.combined, abs(log2FoldChange) >= 1.5 & padj < 0.05)

  return(deseq.bbl.data.combined)
}
```

```{r}
plot_dgea <- function(deseq.bbl.data.combined) {
  sizes <- c("<10^-15" = 4, "10^-10" = 3, "10^-5" = 2, "0.05" = 1)

  deseq.bbl.data.combined <- deseq.bbl.data.combined %>%
    mutate(fdr_category = cut(padj,
      breaks = c(-Inf, 1e-15, 1e-10, 1e-5, 0.05),
      labels = c("<10^-15", "10^-10", "10^-5", "0.05"),
      right = FALSE
    ))

  top_genes <- deseq.bbl.data.combined %>%
    group_by(cancer_type) %>%
    mutate(rank = rank(-abs(log2FoldChange))) %>%
    dplyr::filter(rank <= 10) %>%
    ungroup()

  ggplot(top_genes, aes(y = cancer_type, x = gene_symbol, size = fdr_category, fill = log2FoldChange)) +
    geom_point(alpha = 0.5, shape = 21, color = "black") +
    scale_size_manual(values = sizes) +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.combined$log2FoldChange), max(deseq.bbl.data.combined$log2FoldChange))) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(size = 9, angle = 90, hjust = 1)
    ) +
    theme(legend.position = "bottom") +
    theme(legend.position = "bottom") +
    labs(size = "Adjusted p-value", fill = "log2 FC", y = "Cancer type", x = "Gene")
}
```

```{r}
perform_vsd <- function(genes) {
  tcga_rcd <- list()

  for (project in projects) {
    rownames(genes) <- genes$gene_id
    tcga_rcd[[project]] <- tcga_matrix[[project]][rownames(tcga_matrix[[project]]) %in% genes$gene_id, ]
    tcga_rcd[[project]] <- tcga_rcd[[project]][, rownames(samples[[project]])]
  }

  vsd_rcd <- list()

  for (project in projects) {
    print(project)
    print("=============")
    dds <- DESeqDataSetFromMatrix(
      countData = tcga_rcd[[project]],
      colData = samples[[project]],
      design = ~type
    )
    dds <- filter_genes(dds, min_count = 10)

    # Perform variance stabilization
    dds <- estimateSizeFactors(dds)
    nsub <- sum(rowMeans(counts(dds, normalized = TRUE)) > 10)
    vsd <- vst(dds, nsub = nsub)
    vsd_rcd[[project]] <- assay(vsd)
  }

  return(vsd_rcd)
}
```


#### Necroptosis

Fetch the gene set of interest.

```{r}
genes <- read.csv(paste0(RCDdb, "Necroptosis.csv"))
print(genes)
genes$gene_id <- cleanid(genes$gene_id)
genes <- distinct(genes, gene_id, .keep_all = TRUE)
genes <- subset(genes, gene_id != "")
genes
```

Filter the genes to include only those in the gene set of interest, and then perform differential gene expression analysis.

```{r}
deseq.bbl.data.combined <- filter_gene_set_and_perform_dgea(genes)
deseq.bbl.data.combined
```

Plot the results.

```{r}
plot_dgea(deseq.bbl.data.combined)
```
Perform variance-stabilizing transformation for further downstream analysis (i.e., for survival analysis).

```{r, warning=FALSE}
vsd <- perform_vsd(genes)
```

## V. Downloading the clinical data

Download clinical data from TCGA, and perform some preprocessing:
- The `deceased` column should be `FALSE` if the patient is alive and `TRUE` otherwise
- The `overall_survival` column should reflect the follow-up time if the patient is alive and the days to death otherwise

```{r}
download_clinical_data <- function(project) {
  clinical_data <- GDCquery_clinic(project)
  clinical_data$deceased <- ifelse(clinical_data$vital_status == "Alive", FALSE, TRUE)
  clinical_data$overall_survival <- ifelse(clinical_data$vital_status == "Alive",
    clinical_data$days_to_last_follow_up,
    clinical_data$days_to_death
  )

  return(clinical_data)
}
```

```{r}
tcga_clinical <- list()
for (project in projects) {
  tcga_clinical[[project]] <- download_clinical_data(project)
}
```

## VI. Performing survival analysis

Write utility functions for performing survival analysis.


```{r}
construct_gene_df <- function(gene_of_interest, project) {
  gene_df <- vsd[[project]] %>%
    as.data.frame() %>%
    rownames_to_column(var = "gene_id") %>%
    gather(key = "case_id", value = "counts", -gene_id) %>%
    left_join(., genes, by = "gene_id") %>%
    dplyr::filter(gene == gene_of_interest) %>%
    dplyr::filter(case_id %in% rownames(samples[[project]] %>% dplyr::filter(type == "tumor")))

  q1 <- quantile(gene_df$counts, probs = 0.25)
  q3 <- quantile(gene_df$counts, probs = 0.75)
  gene_df$strata <- ifelse(gene_df$counts >= q3, "HIGH", ifelse(gene_df$counts <= q1, "LOW", "MIDDLE"))
  gene_df <- gene_df %>% dplyr::filter(strata %in% c("LOW", "HIGH"))
  gene_df$case_id <- paste0(sapply(strsplit(as.character(gene_df$case_id), "-"), `[`, 1), '-',
                          sapply(strsplit(as.character(gene_df$case_id), "-"), `[`, 2), '-', 
                          sapply(strsplit(as.character(gene_df$case_id), "-"), `[`, 3))
  gene_df <- merge(gene_df, tcga_clinical[[project]], by.x = "case_id", by.y = "submitter_id")
  
  return(gene_df)
}
```

```{r}
compute_surival_fit <- function(gene_df) {
  return (survfit(Surv(overall_survival, deceased) ~ strata, data = gene_df))
}
```

```{r}
compute_cox <- function(gene_df) {
  return (coxph(Surv(overall_survival, deceased) ~ strata, data=gene_df))
}
```

```{r}
plot_survival <- function(fit) {
  return(ggsurvplot(fit,
    data = gene_df,
    pval = T,
    risk.table = T,
    risk.table.height = 0.3
  ))
}
```

```{r}
compute_survival_diff <- function(gene_df) {
  return(survdiff(Surv(overall_survival, deceased) ~ strata, data = gene_df))
}
```

Perform survival analysis by testing for the difference in the Kaplan-Meier curves using the G-rho family of Harrington and Fleming tests: https://rdrr.io/cran/survival/man/survdiff.html

MLKL is the primary executor of necroptosis.

```{r}
significant_projects <- c()
significant_genes <- c()

ctr <- 1
for (project in projects) {
  for (gene in c("MLKL", genes$gene)) {
    cat(project, gene, "\n\n")
    cat(project, gene, "\n\n")
    error <- tryCatch (
      {
        gene_df <- construct_gene_df(gene, project)
      },
      error = function(e) {
        cat("\n\n============================\n\n")
        e
      }
    )
    
    if(inherits(error, "error")) next

    if (nrow(gene_df) > 0) {
      fit <- compute_surival_fit(gene_df)
      tryCatch (
        {
          survival <- compute_survival_diff(gene_df)
          cox <- compute_cox(gene_df)
          print(ctr)
          ctr <- ctr + 1
          print(survival)
          cat("\n")
          print(cox)
          print(plot_survival(fit))
          if (pchisq(survival$chisq, length(survival$n)-1, lower.tail = FALSE) < 0.05) {
            significant_projects <- c(significant_projects, project)
            significant_genes <- c(significant_genes, gene)
          }
        },
        error = function(e) {
        }
      )
      
    }
    
    cat("\n\n============================\n\n")
  }
}
```

Display the results only for genes where a significant difference in survival has been reported.

```{r}
significant_genes
```

```{r}
num_significant_genes <- length(significant_genes)

if (num_significant_genes > 0) {
  for (i in 1 : num_significant_genes) {
    project <- significant_projects[[i]]
    gene <- significant_genes[[i]]
    
    cat(project, gene, "\n\n")
    gene_df <- construct_gene_df(gene, project)
    
    fit <- compute_surival_fit(gene_df)
    survival <- compute_survival_diff(gene_df)
    cox <- compute_cox(gene_df)
    print(survival)
    cat("\n")
    print(cox)
    print(plot_survival(fit))
    
    cat("\n\n============================\n\n")
  } 
}
```