diff --git a/code/association_scan/quantile_twas/quantile_twas.ipynb b/code/association_scan/quantile_twas/quantile_twas.ipynb
index edd6c96f..96812828 100644
--- a/code/association_scan/quantile_twas/quantile_twas.ipynb
+++ b/code/association_scan/quantile_twas/quantile_twas.ipynb
@@ -106,7 +106,7 @@
 "\n",
 "For each analysis region, the output is quantile qtl model and quantile twas fitted and saved in RDS format.\n",
 "\n",
- "For each gene, several of summary statistics files are generated, including significant quantile qtl nominal test statistics for each test.\n",
+ "For each gene, several summary statistics files are generated, including all quantile qtl nominal test statistics for each test.\n",
 "\n",
 "The columns of significant quantile qtl nominal association result are as follows:\n",
 "\n",
@@ -120,8 +120,9 @@
 "- p_qr_0.05 to p_qr_0.95: quantile-specific QR p-values for the quantile levels 0.05, 0.1, ..., 0.95. \n",
 "- qvalue_qr: the q-value of p_qr. \n",
 "- qvalue_qr_0.05 to qvalue_qr_0.95: quantile-specific QR q-values for the quantile levels 0.05, 0.1, ..., 0.95. \n",
- "- coef_qr_0.05 to coef_qr_0.95: quantile-specific QR coefficients for the quantile levels 0.05, 0.1, ..., 0.95. \n",
- "- se_qr_0.05 to se_qr_0.95: quantile-specific QR standard errors for the quantile levels 0.05, 0.1, ..., 0.95. \n",
+ "- zscore_qr_0.05 to zscore_qr_0.95: quantile-specific QR z-scores for the quantile levels 0.05, 0.1, ..., 0.95. \n",
+ "- coef_qr_0.05 to coef_qr_0.95 (reported only for SNPs retained after LD clumping): quantile-specific QR coefficients for the quantile levels 0.05, 0.1, ..., 0.95. \n",
+ "\n",
 "\n",
 "\n",
 "Additionally, the matrix of quantile TWAS weights and pseudo R-squares is calculated and saved using Koenker and Machado's Pseudo R² method, as described in their [Koenker and Machado, 1999: Inference in Quantile Regression](https://www.maths.usyd.edu.au/u/jchan/GLM/Koenker&Machado1999InferenceQuantileReg.pdf)"
@@ -681,10 +682,7 @@
 "R: expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container, entrypoint = entrypoint\n",
 "    options(warn=1)\n",
 "    library(pecotmr)\n",
- "    library(quantreg)\n",
- "    library(qvalue)\n",
- "    library(tidyverse)\n",
- "    library(dplyr) \n",
+ "    \n",
 "    start_time_total <- proc.time()\n",
 "    \n",
 "    phenotype_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input[1:len(_input)//2+1]])})\n",
@@ -696,201 +694,6 @@
 "    phenotype_header = ${\"4\" if int(_meta_info[0].split('-')[-1])>0 else \"1\"}\n",
 "    region_name_col = ${\"4\" if int(_meta_info[0].split('-')[-1])>0 else \"1\"} \n",
 "\n",
- "    # functions\n",
- "    compute_qvalues <- function(pvalues) {\n",
- "      tryCatch({\n",
- "        if(length(pvalues) < 2) {\n",
- "          return(pvalues)\n",
- "        } else {\n",
- "          return(qvalue(pvalues)$qvalues)\n",
- "        }\n",
- "      }, error = function(e) {\n",
- "        message(\"Too few p-values to calculate qvalue, fall back to BH\")\n",
- "        qvalue(pvalues, pi0 = 1)$qvalues\n",
- "      })\n",
- "    }\n",
- "    \n",
- "    # Cauchy combination\n",
- "    Get.cauchy<-function(p, na.rm = T){\n",
- "      if(na.rm){\n",
- "        if(sum(is.na(p))){\n",
- "          p = p[!is.na(p)]\n",
- "        }\n",
- "      }\n",
- "      p[p>0.99]<-0.99\n",
- "      is.small<-(p<1e-16) & !is.na(p)\n",
- "      is.regular<-(p>=1e-16) & !is.na(p)\n",
- "      temp<-rep(NA,length(p))\n",
- "      temp[is.small]<-1/p[is.small]/pi\n",
- "      temp[is.regular]<-as.numeric(tan((0.5-p[is.regular])*pi))\n",
- "      \n",
- "      cct.stat<-mean(temp,na.rm=T)\n",
- "      if(is.na(cct.stat)){return(NA)}\n",
- "      
if(cct.stat>1e+15){return((1/cct.stat)/pi)}else{\n", - " return(1-pcauchy(cct.stat))\n", - " }\n", - " }\n", - "\n", - " #FIXME: although calculate top_count and top_percent snps, we only use threshold to filter snps for now. \n", - " QR_screen <- function(ExprData, tau.list, threshold = 0.05, method = 'qvalue', top_count = 10, top_percent = 15){\n", - " p = ncol(ExprData$X)\n", - " # Generate random column names for SNPs\n", - " print(colnames(ExprData$X))\n", - " \n", - " pvec = rep(NA, p)\n", - " ltau = length(tau.list)\n", - " # Initialize quantile.pvalue with appropriate column names\n", - " quantile.pvalue = matrix(NA, nrow = p, ncol = ltau, dimnames = list(colnames(ExprData$X), paste(\"p_qr\", tau.list, sep = \"_\")))\n", - " \n", - " for(ip in 1:p){\n", - " x = as.matrix(ExprData$X[,ip])\n", - " y = as.matrix(ExprData$Y)\n", - " zz = cbind(rep(1, nrow(y)), ExprData$C)\n", - " VN = matrix(0, nrow = ltau, ncol = ltau)\n", - " for (i in 1:ltau) {\n", - " for (j in 1:ltau) {\n", - " VN[i, j] = min(tau.list[i], tau.list[j]) - tau.list[i] * tau.list[j]\n", - " }\n", - " }\n", - " xstar = lm(x ~ zz - 1)$residual\n", - " SN = NULL\n", - " for (itau in 1:ltau) {\n", - " ranks = suppressWarnings(rq.fit.br(zz, y, tau = tau.list[itau])$dual - \n", - " (1 - tau.list[itau]))\n", - " Sn = as.matrix(t(xstar) %*% (ranks))\n", - " SN = c(SN, Sn)\n", - " }\n", - " VN2 = matrix(outer(VN, t(xstar) %*% xstar, \"*\"), nrow = ltau)\n", - " pvalue1 = pchisq(SN^2/diag(VN2), 1, lower.tail = F)\n", - " names(pvalue1) = tau.list\n", - " # Store quantile-specific p-values directly in the correct place\n", - " quantile.pvalue[ip, ] <- pvalue1\n", - " e = solve(chol(VN2))\n", - " SN2 = t(e) %*% SN\n", - " # composite p-value\n", - " pvalue = pchisq(sum(SN2^2), ltau, lower.tail = F)\n", - " \n", - " }\n", - "\n", - " # cauchy combined p-value\n", - " pvec <- apply(quantile.pvalue, 1, Get.cauchy)\n", - " # adjusted pvalue\n", - " if (method == 'fdr') {\n", - " adjusted_pvalues = p.adjust(pvec)\n", - " method_col_name = \"fdr_p_qr\"\n", - " method_quantile_names = paste0(\"fdr_p_qr_\", tau.list)\n", - " quantile_adjusted_pvalues = apply(quantile.pvalue, 2, p.adjust) \n", - " } else if (method == 'qvalue') {\n", - " adjusted_pvalues = compute_qvalues(pvec)\n", - " method_col_name = \"qvalue_qr\"\n", - " method_quantile_names = paste0(\"qvalue_qr_\", tau.list)\n", - " quantile_adjusted_pvalues = apply(quantile.pvalue, 2, compute_qvalues) \n", - " } else {\n", - " stop(\"Invalid method. 
Choose 'fdr' or 'qvalue'.\")\n", - " }\n", - " \n", - " sig_SNP_threshold = which(adjusted_pvalues < threshold)\n", - " sig_SNP_top_count = order(adjusted_pvalues)[1:top_count]\n", - " sig_SNP_top_percent = order(adjusted_pvalues)[1:max(1, round(length(adjusted_pvalues) * top_percent / 100))]\n", - " \n", - " sig.SNPs_names = colnames(ExprData$X)[sig_SNP_threshold]\n", - " sig.SNPs_names_top_count = colnames(ExprData$X)[sig_SNP_top_count]\n", - " sig.SNPs_names_top_percent = colnames(ExprData$X)[sig_SNP_top_percent]\n", - " phenotype_id = colnames(y)[1]\n", - "\n", - " if (length(sig_SNP_threshold) > 0) {\n", - " df_result = data.frame(\n", - " phenotype_id = phenotype_id,\n", - " variant_id = rownames(quantile.pvalue)[sig_SNP_threshold],\n", - " p_qr = pvec[sig_SNP_threshold],\n", - " quantile.pvalue[sig_SNP_threshold, ], \n", - " method_pvalue = adjusted_pvalues[sig_SNP_threshold],\n", - " quantile_adjusted_pvalues[sig_SNP_threshold, ]\n", - " )\n", - " # Rename the 'method_pvalue' column to the appropriate name based on method\n", - " colnames(df_result)[ncol(df_result) - ltau] <- method_col_name \n", - " colnames(df_result)[(ncol(df_result) - ltau + 1):ncol(df_result)] <- method_quantile_names \n", - " } else {\n", - " df_result = data.frame() # Return empty data frame if no significant SNPs\n", - " }\n", - "\n", - " return(list(sig_p_df = df_result,\n", - " tau_list = tau.list, \n", - " quantile_pvalue = quantile.pvalue, \n", - " pvec = pvec, \n", - " adjusted_pvalues = adjusted_pvalues, \n", - " sig_SNP_threshold = sig_SNP_threshold, \n", - " sig_SNP_top_count = sig_SNP_top_count, \n", - " sig_SNP_top_percent = sig_SNP_top_percent))\n", - " }\n", - "\n", - "\n", - " # filter out highly correlated SNPs\n", - " SNPs_Filter <- function(X, cor_thres = 0.8) {\n", - " p <- ncol(X)\n", - " \n", - " # Use Rfast for faster correlation calculation if available\n", - " if (requireNamespace(\"Rfast\", quietly = TRUE)) {\n", - " cor.X <- Rfast::cora(X, large = TRUE)\n", - " } else {\n", - " cor.X <- cor(X)\n", - " } \n", - " Sigma.distance = as.dist(1 - abs(cor.X))\n", - " fit = hclust(Sigma.distance, method=\"single\")\n", - " clusters = cutree(fit, h=1-cor_thres)\n", - " groups = unique(clusters)\n", - " ind.delete = NULL\n", - " X.new = X\n", - " filter.id = c(1:p)\n", - " for (ig in 1:length(groups)) {\n", - " temp.group = which(clusters == groups[ig])\n", - " if (length(temp.group) > 1) {\n", - " ind.delete = c(ind.delete, temp.group[-1])\n", - " }\n", - " }\n", - " ind.delete = unique(ind.delete)\n", - " if ((length(ind.delete) ) > 0) {\n", - " X.new = as.matrix(X[,-ind.delete])\n", - " filter.id = filter.id[-ind.delete]\n", - " }\n", - " return(list(X.new = X.new, filter.id = filter.id))\n", - " }\n", - "\n", - " # Function to calculate coefficients and pseudo R^2 across multiple quantiles: Koenker and Machado (1999)\n", - " calculate_qr_and_pseudo_R2 <- function(ExprData, tau.list) {\n", - " # Fit models for all taus\n", - " fit_full <- rq(Y ~ X.filter + C, tau = tau.list, data = ExprData)\n", - " fit_intercept <- rq(Y ~ 1, tau = tau.list, data = ExprData)\n", - " \n", - " # Define rho function\n", - " rho <- function(u, tau) {\n", - " u * (tau - (u < 0))\n", - " }\n", - " \n", - " # Prepare to store results\n", - " pseudo_R2 <- numeric(length(tau.list))\n", - " names(pseudo_R2) <- tau.list\n", - " \n", - " # Calculate pseudo R^2 for each tau\n", - " for (i in seq_along(tau.list)) {\n", - " tau <- tau.list[i]\n", - " \n", - " # Get residuals\n", - " residuals0 <- residuals(fit_intercept, 
subset = i)\n", - " residuals1 <- residuals(fit_full, subset = i)\n", - " \n", - " # Calculate and store pseudo R^2\n", - " rho0 <- sum(rho(residuals0, tau))\n", - " rho1 <- sum(rho(residuals1, tau))\n", - " pseudo_R2[i] <- 1 - rho1 / rho0\n", - " }\n", - " \n", - " # Extract coefficients of snps, removing intercept if included\n", - " num_filter_vars <- ncol(ExprData$X.filter)\n", - " beta_mat <- coef(fit_full)[2:(1 + num_filter_vars), , drop = FALSE]\n", - " list(beta_mat = beta_mat, pseudo_R2 = pseudo_R2)\n", - " }\n", - "\n", "\n", " # extract subset of samples\n", " keep_samples = NULL\n", @@ -898,7 +701,7 @@ " keep_samples = unlist(strsplit(readLines(${keep_samples:ar}), \"\\\\s+\"))\n", " message(paste(length(keep_samples), \"samples are selected to be loaded for analysis\"))\n", " }\n", - " \n", + "\n", " # Load regional association data\n", " tryCatch({\n", " fdat = load_regional_association_data(genotype = ${_input[0]:anr},\n", @@ -947,160 +750,47 @@ " condition_names = vector()\n", " r = 1\n", " while (r<=length(fdat$residual_Y)) {\n", - " tau.list = seq(0.05, 0.95, 0.05)\n", - " genotype_df = fdat$X_data[[r]]\n", - " if (is.null(dim(fdat$Y[[r]]))) {\n", - " fdat$Y[[r]] <- matrix(fdat$Y[[r]], nrow = length(fdat$Y[[r]]), ncol = 1)\n", - " } \n", - " phenotype_df = fdat$Y[[r]]\n", - " colnames(phenotype_df) <- extract_region_name[[r]] \n", - " covariates_df = fdat$covar[[r]]\n", - " colnames(fdat$residual_Y[[r]]) <- extract_region_name[[r]] \n", - " X_residual = fdat$residual_X[[r]] \n", - " ExprData = list(X = genotype_df, Y = phenotype_df, C = covariates_df)\n", - "\n", - " # step 1: qrank filter sig p\n", - " qrank_start_time <- proc.time()\n", - " p.screen = QR_screen(ExprData, tau.list, threshold = 0.05, method = \"qvalue\", top_count = 10,top_percent = 15) #$pvec $sig.SNPs\n", - " qrank_end_time <- proc.time()\n", - " qrank_time <- qrank_end_time - qrank_start_time\n", - " print('qrank_time:')\n", - " print(qrank_time)\n", - " print(paste0(\"there are \", length(p.screen$sig_SNP_threshold), \" SNPs selected out of \", length(p.screen$pvec), \" SNPs.\"))\n", - "\n", - " if (length(p.screen$sig_SNP_threshold) == 0) {\n", - " message(paste0(\"No significant SNPs detected in gene \", extract_region_name[[r]], \".\")) \n", - " } else {\n", - " if (length(p.screen$sig_SNP_threshold) == 1) {\n", - " ExprData$X.filter = ExprData$X[, p.screen$sig_SNP_threshold, drop = FALSE]\n", - " snp_to_keep <- colnames(ExprData$X.filter)\n", - " X.filter_residual = X_residual[, snp_to_keep, drop = FALSE] \n", - " print(\"Only one significant SNP, skipping correlation filter.\")\n", - " } else {\n", - " # Step 2: filter out highly correlated SNPs\n", - " print('Proceeding to filter out highly correlated SNPs')\n", - " temp = SNPs_Filter(ExprData$X[, p.screen$sig_SNP_threshold, drop = FALSE], 0.8)\n", - " ExprData$X.filter = temp$X.new\n", - " snp_to_keep <- colnames(temp$X.new)\n", - " # Apply the same filtering to the residual matrix\n", - " X.filter_residual = X_residual[, snp_to_keep, drop = FALSE]\n", - " }\n", - "\n", - " # filter p.screen results of significant pvalue without high corr \n", - " sig_p_df <- p.screen$sig_p_df[rownames(p.screen$sig_p_df) %in% snp_to_keep, ]\n", - " new_names = names(fdat$residual_Y)[r]\n", - "\n", - " if (is.null(names(fdat$residual_Y)[r])) {\n", - " names(fdat$residual_Y)[r] = extract_region_name[[r]] \n", - " } \n", - " \n", - " new_names = names(fdat$residual_Y)[r]\n", - " new_col_names = list(${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else 
(\"c\"+ str(x)) for x in _meta_info[3]])})[[r]]\n", - " if (is.null(new_col_names)) {\n", - " new_col_names = 1:ncol(fdat$residual_Y[[r]])\n", - " }\n", - " if (!(identical(new_names, new_col_names))) {\n", - " new_names = paste(new_names, new_col_names, sep = \"_\")\n", - " } \n", - "\n", - " # step 3: fit QR for 19 tau from 0.05 to 0.95, return beta for each snp.\n", - " residual_Y = fdat$residual_Y[[r]]\n", - " pheno.mat = as.matrix(residual_Y)\n", - " colnames(pheno.mat) = extract_region_name[[r]]\n", - " geno.mat = as.matrix(X.filter_residual) \n", - "\n", - " rq_qtl_start_time <- proc.time()\n", - " result_table_br <- data.frame(phenotype_id = character(),\n", - " variant_id = character(),\n", - " tau = numeric(),\n", - " predictor_coef = numeric(),\n", - " predictor_se = numeric(),\n", - " stringsAsFactors = FALSE)\n", - "\n", - " for (tau in seq(0.05, 0.95, by = 0.05)) {\n", - " for (n in 1:ncol(geno.mat)){\n", - " response <- pheno.mat\n", - " predictor <- geno.mat[, n]\n", - " phenotype_id <- colnames(pheno.mat)\n", - " variant_id <- colnames(geno.mat)[n]\n", - " # Using rq\n", - " mod_br <- rq(response ~ predictor, tau = tau)\n", - " summary_rq <- summary(mod_br, se = 'boot')\n", - " predictor_coef_br <- summary_rq$coefficients[2, 1] #get the coefficient of variant\n", - " predictor_se_br <- summary_rq$coefficients[2, 2] #get the se of variant\n", - " row_br <- data.frame(phenotype_id = phenotype_id, variant_id = variant_id, tau = tau, predictor_coef = predictor_coef_br, predictor_se = predictor_se_br, stringsAsFactors = FALSE)\n", - " result_table_br <- rbind(result_table_br, row_br)\n", - " }\n", - " }\n", - "\n", - " result_table_wide <- result_table_br %>%\n", - " pivot_wider(\n", - " id_cols = c(phenotype_id, variant_id),\n", - " names_from = tau,\n", - " values_from = c(predictor_coef, predictor_se),\n", - " names_glue = \"{.value}_{tau}\"\n", - " )\n", - "\n", - " colnames(result_table_wide)[3:ncol(result_table_wide)] <- \n", - " paste0(rep(c(\"coef_qr_\", \"se_qr_\"), each = length(seq(0.05, 0.95, by = 0.05))), \n", - " rep(seq(0.05, 0.95, by = 0.05), 2))\n", - " cat(\"rq.fit.br result loaded \", dim(result_table_wide), \"\\n\")\n", - " rq_qtl_end_time <- proc.time()\n", - " rq_qtl_time <- rq_qtl_end_time - rq_qtl_start_time \n", - " print('rq_qtl_time:')\n", - " print(rq_qtl_time)\n", - "\n", - " # combined p(composite-p from default QRank and cauchy method) and coef table\n", - " ## QR summary statistics\n", - " ## row: variant\n", - " ## column: p-value, coefficient, se and other parameters\n", - " qr_nominal_result = merge(sig_p_df, result_table_wide, by = c(\"phenotype_id\", \"variant_id\")) %>%\n", - " separate(variant_id, into = c(\"chr\", \"pos\", \"ref\", \"alt\"), sep = \"[:_]\", remove = FALSE) %>%\n", - " mutate(chr = as.numeric(gsub(\"chr\", \"\", chr)), pos = as.numeric(pos))\n", - " cat(\"combined p and beta result loaded: \",dim(qr_nominal_result) , \"\\n\")\n", - " column_names <- colnames(qr_nominal_result)\n", - "\n", - " #change order\n", - " new_order <- c(column_names[3:6], column_names[1:2], column_names[7:length(column_names)])\n", - " qr_nominal_result <- qr_nominal_result[new_order]\n", - " print(head(qr_nominal_result))\n", - "\n", - " # Step 3: fit QR for all taus, all snps in one model \n", - " qr_beta_R2_results_start_time <- proc.time()\n", - "\n", - " tau.list2 = seq(0.01,0.99, 0.01) # 0.01,0.02,0.03,...,0.99, 99 taus\n", - " qr_beta_R2_results <- calculate_qr_and_pseudo_R2(ExprData, tau.list2)\n", - " qr_beta_R2_results_end_time <- 
proc.time()\n", - " qr_beta_R2_results_time <- qr_beta_R2_results_end_time - qr_beta_R2_results_start_time\n", - " print('qr_beta_R2_results_time:')\n", - " print(qr_beta_R2_results_time)\n", - " beta.mat = qr_beta_R2_results$beta_mat\n", - " rownames_beta <- rownames(beta.mat)\n", - " rownames(beta.mat) <- gsub(\"^X.filter\", \"\", rownames_beta) \n", - " pseudo_R2 = qr_beta_R2_results$pseudo_R2\n", - "\n", - " # define twas weight\n", - " ## FIXME: so far we use beta as twas weight, and just calculate pseudo R^2 without actual filtering out.\n", - " twas_weight = beta.mat\n", + " X = fdat$X_data[[r]] \n", + " Y <- fdat$Y[[r]]\n", + " if (is.null(dim(Y))) {\n", + " Y <- matrix(Y, nrow = length(Y), ncol = 1)\n", + " }\n", + " colnames(Y) <- extract_region_name[[r]] \n", + " Z <- fdat$covar[[r]]\n", "\n", - " quantile_twas_prediction = ExprData$X.filter%*%beta.mat\n", + " qr_results = quantile_twas_weight_pipeline(X = X, \n", + " Y = Y, \n", + " Z = Z, \n", + " maf = NULL, #only use for LD pruning, it will calculate MAF automatically\n", + " extract_region_name = extract_region_name[[r]],\n", + " quantile_qtl_tau_list = seq(0.05, 0.95, 0.05),\n", + " quantile_twas_tau_list = seq(0.01, 0.99, by = 0.01))\n", "\n", - " # save info of ExprData$X.filter, qr_nominal_result, twas_weight, pseudo_R2 \n", - " # Store results in the fitted list for this iteration\n", - " fitted[[r]] <- list(\n", - " variant_names <- colnames(ExprData$X.filter),\n", - " qr_nominal_result = qr_nominal_result,\n", - " twas_weight = twas_weight,\n", - " pseudo_R2 = pseudo_R2,\n", - " quantile_twas_prediction = quantile_twas_prediction\n", - " ) \n", "\n", - " condition_names <- c(condition_names, new_names)\n", + " if (is.null(names(fdat$residual_Y)[r])) {\n", + " names(fdat$residual_Y)[r] = extract_region_name[[r]] \n", + " } \n", + " \n", + " new_names = names(fdat$residual_Y)[r]\n", + " new_col_names = list(${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else (\"c\"+ str(x)) for x in _meta_info[3]])})[[r]]\n", + " if (is.null(new_col_names)) {\n", + " new_col_names = 1:ncol(fdat$residual_Y[[r]])\n", " }\n", + " if (!(identical(new_names, new_col_names))) {\n", + " new_names = paste(new_names, new_col_names, sep = \"_\")\n", + " } \n", "\n", - " # original data no longer relevant, set to NA to release memory\n", + " fitted[[r]] <- qr_results\n", + " \n", + " if (!is.null(qr_results$message)) {\n", + " message(qr_results$message)\n", + " }\n", + " \n", + " condition_names <- c(condition_names, new_names)\n", + " \n", + " # Release memory\n", " fdat$residual_X[[r]] <- NA\n", - " fdat$residual_Y[[r]] <- NA\n", + " fdat$residual_Y[[r]] <- NA \n", " r = r + 1\n", " } \n", "\n", diff --git a/code/pecotmr_integration/twas.ipynb b/code/pecotmr_integration/twas.ipynb index 7d759c31..35e09a4f 100644 --- a/code/pecotmr_integration/twas.ipynb +++ b/code/pecotmr_integration/twas.ipynb @@ -427,14 +427,12 @@ "# Threashold for rsq and pvalue for imputability determination for a model \n", "parameter: p_value_cutoff = 0.05\n", "parameter: rsq_threshold = 0.01\n", - "parameter: save_weights_db = True\n", + "parameter: mr_pval_cutoff = 0.05\n", "parameter: save_ctwas_data = True\n", "parameter: save_mr_result = True\n", "\n", "input: filtered_regional_xqtl_files, group_by = lambda x: group_by_region(x, filtered_regional_xqtl_files), group_with = \"filtered_region_info\"\n", "output_files = [f'{cwd:a}/{step_name}/{name}.{_filtered_region_info[3]}.twas.tsv.gz']\n", - "if save_weights_db: \n", - " 
output_files.append(f'{cwd:a}/{step_name}/{name}.{_filtered_region_info[3]}.merged_twas_weights.rds')\n", "if save_ctwas_data:\n", " output_files.append(f'{cwd:a}/{step_name}/{name}.{_filtered_region_info[3]}.twas_data.rds')\n", "if save_mr_result:\n", @@ -447,9 +445,6 @@ " library(data.table)\n", " library(pecotmr)\n", " library(readr)\n", - " \n", - " region_block <- unlist(strsplit(\"${_filtered_region_info[3]}\", \"_\\\\s*\"))\n", - " chrom <- as.integer(gsub(\"chr\", \"\", region_block[1]))\n", "\n", " # get xQTL weight information \n", " xqtl_meta_df <- fread(\"${xqtl_meta_data}\") # Get related gene information from the xqtl_meta data table\n", @@ -472,28 +467,16 @@ " # Step 1: Load TWAS weights and generate twas weight db, output export_twas_weights_db \n", " for (gene_db in names(weight_db_list_update)) {\n", " weight_dbs <- weight_db_list_update[[gene_db]]\n", - " twas_weights_results[[gene_db]] = generate_twas_db(weight_dbs, contexts=NULL,\n", - " variable_name_obj=\"variant_names\", \n", + " twas_weights_results[[gene_db]] = load_twas_weights(weight_dbs, variable_name_obj=\"variant_names\", \n", " susie_obj=\"susie_weights_intermediate\",\n", - " twas_weights_table = \"twas_weights\", \n", - " min_rsq_threshold = ${rsq_threshold}, \n", - " p_val_cutoff = ${p_value_cutoff}, \n", - " data_type_table=xqtl_type_table, \n", - " region_name=gene_db)\n", - " if (gene_db %in% names(twas_weights_results)){\n", - " # merge and update export_twas_weights_db for imputable genes, merge if same gene's other context results are available \n", - " if (!is.null(twas_weights_results[[gene_db]])){\n", - " export_twas_weights_db[[\"${_filtered_region_info[3]}\"]][[gene_db]] <- c(export_twas_weights_db[[\"${_filtered_region_info[3]}\"]][[gene_db]],\n", - " twas_weights_results[[gene_db]]$export_twas_weights_db)\n", - " }\n", - " }\n", - " }\n", - " if (${\"TRUE\" if save_weights_db else \"FALSE\"}) {\n", - " saveRDS(export_twas_weights_db,\"${_output[0]:nnn}.merged_twas_weights.rds\", compress='xz')\n", + " twas_weights_table = \"twas_weights\")\n", + " twas_weights_results[[gene_db]]$data_type <- setNames(lapply(names(twas_weights_results[[gene_db]]$weights), function(context) xqtl_type_table$type[sapply(xqtl_type_table$context,\n", + " function(x) grepl(x, context))]), names(twas_weights_results[[gene_db]]$weights)) \n", " }\n", "\n", " # Step 2: twas analysis for imputable genes across contexts\n", - " twas_results_db <- twas_pipeline(twas_weights_results, \"${ld_meta_data}\", \"${gwas_meta_data}\", region_block=\"${_filtered_region_info[3]}\")\n", + " twas_results_db <- twas_pipeline(twas_weights_results, \"${ld_meta_data}\", \"${gwas_meta_data}\", region_block=\"${_filtered_region_info[3]}\",\n", + " min_rsq_threshold = ${rsq_threshold}, p_val_cutoff = ${p_value_cutoff}, mr_pval_cutoff = ${mr_pval_cutoff})\n", " fwrite(twas_results_db$twas_result, file = ${_output[0]:r}, sep = \"\\t\", compress = \"gzip\")\n", "\n", " # Step 3: reformat for follow up cTWAS analysis\n", @@ -501,7 +484,7 @@ " saveRDS(twas_results_db$twas_data, \"${_output[0]:nnn}.twas_data.rds\", compress='xz')\n", " }\n", " if (${\"TRUE\" if save_mr_result else \"FALSE\"}) {\n", - " fwrite(twas_results_db$mr_result, file = \"${_output[0]:nnn}.mr_result.tsv\", sep = \"\\t\", compress = \"gzip\")\n", + " fwrite(twas_results_db$mr_result, file = \"${_output[0]:nnn}.mr_result.tsv.gz\", sep = \"\\t\", compress = \"gzip\")\n", " }" ] }, @@ -521,12 +504,10 @@ "parameter: twas_weight_cutoff = 1e-5\n", "parameter: 
max_num_variants=2000\n",
 "\n",
+ "region_blocks = ', '.join([f\"'{region_info[3]}'\" for region_info in filtered_region_info])\n",
 "twas_output_files = [f'{cwd:a}/twas/{name}.{region_info[3]}.twas_data.rds' for region_info in filtered_region_info]\n",
- "export_twas_weights_db = [f'{cwd:a}/twas/{name}.{region_info[3]}.merged_twas_weights.rds' for region_info in filtered_region_info]\n",
- "export_twas_weights_db = ', '.join([f\"'{f}'\" for f in export_twas_weights_db])\n",
- "\n",
- "input: twas_output_files, group_with = \"export_twas_weights_db\"\n",
- "output: f\"{cwd:a}/{step_name}/{name}.ctwas_fine_mapping.tsv\"\n",
+ "input: twas_output_files, group_by = \"all\"\n",
+ "output: f\"{cwd:a}/{step_name}/{name}.ctwas_results.rds\"\n",
 "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
 "R: expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container, entrypoint = entrypoint\n",
 "\n",
 "    library(ctwas) # multigroup_ctwas\n",
 "    library(pecotmr)\n",
 "\n",
+ "    # load genome-wide data across all regions\n",
 "    regions_data <- get_ctwas_meta_data(\"${ld_meta_data}\", \"${regions}\", \"${xqtl_meta_data}\")\n",
 "    gwas_studies <- unique(fread(\"${gwas_meta_data}\",data.table=FALSE, select = \"study_id\"))[,1]\n",
- "\n",
- "    fine_mapping_db <- do.call(c, lapply(c(${export_twas_weights_db}), readRDS))\n",
 "    data <- lapply(c(${_input:r,}), readRDS)\n",
- "    names(data) <- names(fine_mapping_db)\n",
- "\n",
- "    snp_info <- do.call(c, lapply(data, function(x) x$snp_info))\n",
+ "    names(data) <- c(${region_blocks})\n",
+ "    \n",
+ "    # compile z_snp, z_gene, and snp_info across multiple regions\n",
+ "    snp_info <- do.call(c, lapply(unname(data), function(x) x$snp_info))\n",
 "    snp_info <- snp_info[!duplicated(names(snp_info))]\n",
 "    z_gene_dat <- do.call(c, lapply(unname(data), function(x)x$z_gene))\n",
 "    z_snp_dat <- do.call(c, lapply(unname(data), function(x)x$z_snp))\n",
@@ -552,33 +533,38 @@
 "      df[!duplicated(df$id), ] \n",
 "    }), gwas_studies)\n",
 "\n",
+ "    # select weight variants based on minimum twas weights, cs, and pip\n",
 "    weights <- do.call(c, lapply(names(data), function(region_block){\n",
- "      trim_ctwas_variants(region_block, data[[region_block]], fine_mapping_db[[region_block]], twas_weight_cutoff=${twas_weight_cutoff}, cs_min_cor=0.8,\n",
+ "      trim_ctwas_variants(data[[region_block]], twas_weight_cutoff=${twas_weight_cutoff}, cs_min_cor=0.8,\n",
 "                          min_pip_cutoff=0.1, max_num_variants=${max_num_variants})\n",
 "    }))\n",
 "\n",
- "    ctwas_res <- list()\n",
- "    for (study in gwas_studies){\n",
- "      ctwas_res[[study]] <- ctwas_sumstats(z_snp[[study]],\n",
- "                                           weights,\n",
- "                                           region_info=regions_data$region_info,\n",
- "                                           LD_map=regions_data$LD_info,\n",
- "                                           snp_map=snp_info,\n",
- "                                           z_gene = z_gene[[study]],\n",
- "                                           thin = 0.1,\n",
- "                                           ncore = ${numThreads},\n",
- "                                           outputdir = ${_output[0]:dr},\n",
- "                                           outname = \"${name}\",\n",
- "                                           logfile = file.path(${_output[0]:dr}, paste0(\"${name}\", \".ctwas_sumstats.log\")),\n",
- "                                           verbose = FALSE, \n",
- "                                           cor_dir = NULL,\n",
- "                                           save_cor = FALSE,\n",
- "                                           group_prior_var_structure = \"shared_nonSNP\",\n",
- "                                           LD_format=\"custom\", \n",
- "                                           LD_loader_fun = ctwas_ld_loader,\n",
- "                                           snpinfo_loader_fun = ctwas_bimfile_loader)\n",
- "    }\n",
- "\n"
+ "    # ctwas main function for all studies; keep one result element per study\n",
+ "    ctwas_res <- lapply(gwas_studies, function(study){\n",
+ "      st <- proc.time() \n",
+ "      res <- ctwas_sumstats(z_snp[[study]],\n",
+ "                            weights,\n",
+ "                            region_info=regions_data$region_info,\n",
+ "                            LD_map=regions_data$LD_info,\n",
+ "                            snp_map=snp_info,\n",
+ "                            z_gene = z_gene[[study]],\n",
+ "                            thin = 0.1,\n",
+ "                            ncore = ${numThreads},\n",
+ "                            outputdir = ${_output[0]:dr},\n",
+ "                            outname = \"${name}\",\n",
+ "                            logfile = file.path(${_output[0]:dr}, paste0(\"${name}\", \".ctwas_sumstats.log\")),\n",
+ "                            verbose = FALSE, \n",
+ "                            cor_dir = NULL,\n",
+ "                            save_cor = FALSE,\n",
+ "                            group_prior_var_structure = \"shared_nonSNP\",\n",
+ "                            LD_format=\"custom\", \n",
+ "                            LD_loader_fun = ctwas_ld_loader,\n",
+ "                            snpinfo_loader_fun = ctwas_bimfile_loader)\n",
+ "      res$total_time_elapsed <- proc.time() - st\n",
+ "      return(res)\n",
+ "    })\n",
+ "    names(ctwas_res) <- gwas_studies\n",
+ "    saveRDS(ctwas_res, ${_output:r})\n"
   ]
  }
],
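
A note for readers of the quantile_twas documentation above: the composite `p_qr` column combines the per-quantile QRank p-values with a Cauchy combination (ACAT), as in the `Get.cauchy` helper this patch removes in favor of the `pecotmr` implementation. A minimal self-contained sketch of that transform (equal weights assumed; `cauchy_combine` is an illustrative name, not the package API):

```r
# Cauchy combination (ACAT): map each p-value to a Cauchy variate,
# average them, and convert the mean back to a single p-value.
cauchy_combine <- function(p) {
  p <- p[!is.na(p)]
  p <- pmin(p, 0.99)                      # guard against p ~ 1
  small <- p < 1e-16
  stat <- numeric(length(p))
  stat[small] <- 1 / (p[small] * pi)      # tan((0.5 - p) * pi) ~ 1 / (p * pi) for tiny p
  stat[!small] <- tan((0.5 - p[!small]) * pi)
  cct <- mean(stat)
  if (cct > 1e15) (1 / cct) / pi else 1 - pcauchy(cct)
}

# e.g. combining four per-quantile p-values into one composite p_qr
cauchy_combine(c(0.02, 0.30, 0.51, 0.07))
```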
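Likewise, the Koenker and Machado (1999) pseudo-R² cited in that documentation reduces to a ratio of check-loss sums between the full and intercept-only quantile regressions. A toy sketch using `quantreg` (simulated data; a single tau for brevity, whereas the pipeline sweeps tau = 0.01, ..., 0.99):

```r
library(quantreg)

set.seed(1)
n <- 300
x <- rnorm(n)                                # stand-in genotype / predictor
y <- 0.4 * x + rnorm(n)                      # stand-in phenotype

rho <- function(u, tau) u * (tau - (u < 0))  # quantile check (pinball) loss

tau <- 0.5
fit_full <- rq(y ~ x, tau = tau)             # full model
fit_null <- rq(y ~ 1, tau = tau)             # intercept-only model

# Koenker & Machado (1999): R1(tau) = 1 - V_full(tau) / V_null(tau)
pseudo_R2 <- 1 - sum(rho(resid(fit_full), tau)) / sum(rho(resid(fit_null), tau))
pseudo_R2
```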