-
Notifications
You must be signed in to change notification settings - Fork 1
/
01_DimReduc.R
125 lines (103 loc) · 5.16 KB
/
01_DimReduc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Loading necessary packages
pacman::p_load(data.table, uwot, Rtsne, ggplot2, argparse)
# Function to print log messages
print_log <- function(message){
print(paste0("[", Sys.time(), "] ", message))
}
# Arguments for the script
parser <- argparse::ArgumentParser()
parser$add_argument("--expr_data", default = "../Brain_Frontal_Cortex_BA9.v8.residuals.matrix.csv", help = "Expression data file")
parser$add_argument("--pathway_data", default = "./GeneSets.csv", help = "Pathway data file (1st column pathway names, 2nd column genes in pathway)")
parser$add_argument("--gene_data", default = "../../IBAS-Stability-Internal/gencode.v26.GRCh38.genes.gtf", help = "Gene data file (gtf)")
parser$add_argument("--seed", default = 1234, help = "Seed for reproducibility")
parser$add_argument("--output_dir", default = "./Brain_Frontal_Cortex_01_DimReduc", help = "Output directory")
print_log("Parsing arguments")
args <- parser$parse_args()
print_log("Arguments parsed:")
print_log(paste0("Expression data: ", args$expr_data))
print_log(paste0("Pathway data: ", args$pathway_data))
print_log(paste0("Gene data: ", args$gene_data))
print_log(paste0("Seed: ", args$seed))
print_log(paste0("Output directory: ", args$output_dir))
print_log("Setting seed")
set.seed(args$seed)
print_log("Checking if output directory exists and creating if not")
if(!dir.exists(args$output_dir)){
dir.create(args$output_dir, recursive = TRUE)
}
print_log("Checking if expression data exists")
if(!file.exists(args$expr_data)){
stop("Expression data file does not exist")
}
print_log("Expression data exists")
print_log("Checking if pathway data exists")
if(!file.exists(args$pathway_data)){
stop("Pathway data file does not exist")
}
print_log("Pathway data exists")
print_log("Checking if gene data exists")
if(!file.exists(args$gene_data)){
stop("Gene data file does not exist")
}
print_log("Gene data exists")
print_log("Reading in gene data")
gencode <- as.data.table(stringr::str_split(fread(args$gene_data)$V9, ";", simplify = TRUE)[, c(1,4)])
colnames(gencode) <- c("geneID", "geneName")
gencode$geneID <- stringr::str_remove(gencode$geneID, "gene_id ")
gencode$geneName <- stringr::str_remove(gencode$geneName, "gene_name ")
gencode$geneID <- gsub('[\"]', '', gencode$geneID)
gencode$geneName <- gsub('[\"]', '', gencode$geneName)
gencode$geneName <- trimws(gencode$geneName, which = "both")
gencode <- unique(gencode)
print_log("Reading in pathway data")
pathway_data <- fread(args$pathway_data)
if(ncol(pathway_data) < 2){
stop("Pathway data file must have at least 2 columns")
}
pathway_data <- pathway_data[, c(1,2)]
colnames(pathway_data) <- c("Pathway", "Genes")
# Detect string delimiter in Genes column
if(sum(stringr::str_count(pathway_data$Genes, ";")) == 0){
stop("Genes column must be delimited by ;")
}
# Split genes column by delimiter and create named vector using names in GeneSet
pathway_genes <- stringr::str_split(pathway_data$Genes, ";")
names(pathway_genes) <- pathway_data$Pathway
# Creating unique list of genes
all_genes <- unique(unlist(pathway_genes))
# Identifying corresponding gene IDs
all_gene_ids <- gencode[gencode$geneName %in% all_genes, geneID]
print_log("Reading in expression data")
expr_data <- fread(args$expr_data)
print_log("Assuming matrix is of the form samples (rows) x genes (columns)")
print_log("Converting to matrix of gene expression")
expr_matrix <- t(as.matrix(expr_data[,-c(1)]))
colnames(expr_matrix) <- expr_data[[1]]
# print_log("Selecting only genes that are in the pathways provided")
# expr_matrix <- expr_matrix[,all_gene_ids]
for(i in seq(1, length(names(pathway_genes)))){
curr_pathway <- names(pathway_genes)[i]
print_log(paste0("Working on pathway ", curr_pathway, " (pathway ", i, " of ", length(names(pathway_genes)), ")"))
curr_gene_ids <- gencode[gencode$geneName %in% pathway_genes[[curr_pathway]], geneID]
curr_gene_ids <- curr_gene_ids[curr_gene_ids %in% colnames(expr_matrix)]
curr_expr_matrix <- expr_matrix[,curr_gene_ids]
print_log("Removing genes with zero variance")
curr_expr_matrix <- curr_expr_matrix[, apply(curr_expr_matrix, 2, var) > 0]
pathway_umap <- umap(curr_expr_matrix,
n_components = min(10, ncol(curr_expr_matrix)),
metric = "correlation",
n_epochs = 1000, verbose = FALSE)
umap_df <- as.data.table(pathway_umap)
umap_df <- cbind(data.table(ID = rownames(curr_expr_matrix)), umap_df)
pathway_tsne <- Rtsne(curr_expr_matrix,
verbose = FALSE)
tsne_df <- as.data.table(pathway_tsne$Y)
tsne_df <- cbind(data.table(ID = rownames(curr_expr_matrix)), tsne_df)
pathway_pca <- prcomp(curr_expr_matrix, center = TRUE, scale. = TRUE)
pca_df <- as.data.table(pathway_pca$x[, seq(1,min(10, ncol(curr_expr_matrix)))])
pca_df <- cbind(data.table(ID = rownames(curr_expr_matrix)), pca_df)
print_log("Saving UMAP, tSNE and PCA embeddings")
fwrite(umap_df, file.path(args$output_dir, paste0(curr_pathway, "_umap.tsv")), sep = "\t")
fwrite(tsne_df, file.path(args$output_dir, paste0(curr_pathway, "_tsne.tsv")), sep = "\t")
fwrite(pca_df, file.path(args$output_dir, paste0(curr_pathway, "_pca.tsv")), sep = "\t")
}