From 471b02ed76ac843247b80d4feba906b04ceebf49 Mon Sep 17 00:00:00 2001 From: rl3328 Date: Mon, 30 Sep 2024 21:35:49 +0000 Subject: [PATCH 1/4] update QC for all steps except for gEBMF --- ...rmated.rename_event.bed.imputed.bed.stderr | 18 ++++++++++++++ ...rmated.rename_event.bed.imputed.bed.stdout | 0 ...rename_event.bed_subset.imputed.bed.stderr | 24 +++++++++++++++++++ ...rename_event.bed_subset.imputed.bed.stdout | 5 ++++ 4 files changed, 47 insertions(+) create mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr create mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout create mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr create mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr new file mode 100644 index 000000000..f2d98f826 --- /dev/null +++ b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr @@ -0,0 +1,18 @@ +Loading required package: ebnm +Loading required package: magrittr + +Attaching package: ‘dplyr’ + +The following objects are masked from ‘package:stats’: + + filter, lag + +The following objects are masked from ‘package:base’: + + intersect, setdiff, setequal, union + +Loading required package: Matrix +Loaded softImpute 1.4-1 + + +Execution halted diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout new file mode 100644 index 000000000..e69de29bb diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr new file mode 100644 index 000000000..b0f41d78a --- /dev/null +++ b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr @@ -0,0 +1,24 @@ +Loading required package: ebnm +Loading required package: magrittr + +Attaching package: ‘dplyr’ + +The following objects are masked from ‘package:stats’: + + filter, lag + +The following objects are masked from ‘package:base’: + + intersect, setdiff, setequal, union + +Loading required package: Matrix +Loaded softImpute 1.4-1 + +Rows: 9999 Columns: 1145 +── Column specification ──────────────────────────────────────────────────────── +Delimiter: "\t" +chr (2): #chr, ID +dbl (1143): start, end, 01_120405, 02_120405, 03_120405, 04_120405, 05_12040... + +ℹ Use `spec()` to retrieve the full column specification for this data. +ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message. diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout new file mode 100644 index 000000000..e3a28b284 --- /dev/null +++ b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout @@ -0,0 +1,5 @@ +Backfitting 60 factors (tolerance: 6.55e-02)... + Difference between iterations is within 1.0e+04... + Difference between iterations is within 1.0e+03... + Difference between iterations is within 1.0e+02... + Difference between iterations is within 1.0e+01... From e200dabba781f70582bbf2b6811934bf63b11fe4 Mon Sep 17 00:00:00 2001 From: rl3328 Date: Mon, 30 Sep 2024 23:00:21 +0000 Subject: [PATCH 2/4] gEBMF not resolved --- ...rmated.rename_event.bed.imputed.bed.stderr | 18 -- ...rmated.rename_event.bed.imputed.bed.stdout | 0 ...rename_event.bed_subset.imputed.bed.stderr | 24 --- ...rename_event.bed_subset.imputed.bed.stdout | 5 - .../phenotype/phenotype_imputation.ipynb | 159 ++++++++++++--- code/data_preprocessing/phenotype/xgb_imp.R | 183 ++++++++++++++++++ 6 files changed, 319 insertions(+), 70 deletions(-) delete mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr delete mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout delete mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr delete mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout create mode 100644 code/data_preprocessing/phenotype/xgb_imp.R diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr deleted file mode 100644 index f2d98f826..000000000 --- a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr +++ /dev/null @@ -1,18 +0,0 @@ -Loading required package: ebnm -Loading required package: magrittr - -Attaching package: ‘dplyr’ - -The following objects are masked from ‘package:stats’: - - filter, lag - -The following objects are masked from ‘package:base’: - - intersect, setdiff, setequal, union - -Loading required package: Matrix -Loaded softImpute 1.4-1 - - -Execution halted diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout deleted file mode 100644 index e69de29bb..000000000 diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr deleted file mode 100644 index b0f41d78a..000000000 --- a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr +++ /dev/null @@ -1,24 +0,0 @@ -Loading required package: ebnm -Loading required package: magrittr - -Attaching package: ‘dplyr’ - -The following objects are masked from ‘package:stats’: - - filter, lag - -The following objects are masked from ‘package:base’: - - intersect, setdiff, setequal, union - -Loading required package: Matrix -Loaded softImpute 1.4-1 - -Rows: 9999 Columns: 1145 -── Column specification ──────────────────────────────────────────────────────── -Delimiter: "\t" -chr (2): #chr, ID -dbl (1143): start, end, 01_120405, 02_120405, 03_120405, 04_120405, 05_12040... - -ℹ Use `spec()` to retrieve the full column specification for this data. -ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message. diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout deleted file mode 100644 index e3a28b284..000000000 --- a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout +++ /dev/null @@ -1,5 +0,0 @@ -Backfitting 60 factors (tolerance: 6.55e-02)... - Difference between iterations is within 1.0e+04... - Difference between iterations is within 1.0e+03... - Difference between iterations is within 1.0e+02... - Difference between iterations is within 1.0e+01... diff --git a/code/data_preprocessing/phenotype/phenotype_imputation.ipynb b/code/data_preprocessing/phenotype/phenotype_imputation.ipynb index e1d7f1a57..8aded6ce1 100644 --- a/code/data_preprocessing/phenotype/phenotype_imputation.ipynb +++ b/code/data_preprocessing/phenotype/phenotype_imputation.ipynb @@ -178,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "9073376b-b7e5-46d6-ae70-54229fa72453", "metadata": { "kernel": "SoS" @@ -211,6 +211,8 @@ " Work directory & output directory\n", " --phenoFile VAL (as path, required)\n", " Molecular phenotype matrix\n", + " --[no-]qc-prior-to-impute (default to True)\n", + " QC before imputation to remove unqualified features\n", " --job-size 1 (as int)\n", " For cluster jobs, number commands to run per job\n", " --walltime 72h\n", @@ -285,6 +287,8 @@ "parameter: cwd = path(\"output\")\n", "# Molecular phenotype matrix\n", "parameter: phenoFile = path\n", + "# QC before imputation to remove unqualified features \n", + "parameter: qc_prior_to_impute = True\n", "# For cluster jobs, number commands to run per job\n", "parameter: job_size = 1\n", "# Wall clock time expected\n", @@ -312,7 +316,6 @@ "parameter: prior = \"ebnm_point_laplace\"\n", "parameter: varType = '1'\n", "parameter: num_factor = '60'\n", - "parameter: qc_prior_to_impute = True\n", "input: phenoFile\n", "output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n", "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}', cores = numThreads \n", @@ -327,8 +330,8 @@ " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n", " # Get index of features that have > 40% missing \n", " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n", - " # Get index of features where all values are zero.\n", - " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)]))) == (ncol(pheno) -4))\n", + " # Get index of features where more than 95% values are zero.\n", + " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n", " miss.ind <- c(miss.ind, value0.ind)\n", " # remove these rows if qc_prior_to_impute = True\n", " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n", @@ -417,7 +420,22 @@ " library(\"softImpute\")\n", " \n", " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n", - " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " # Get index of features that have > 40% missing \n", + " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n", + " # Get index of features where more than 95% values are zero.\n", + " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n", + " miss.ind <- c(miss.ind, value0.ind)\n", + " # remove these rows if qc_prior_to_impute = True\n", + " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n", + " if (length(miss.ind) > 0) {\n", + " pheno_qc <- pheno[-miss.ind, ]\n", + " } else {\n", + " pheno_qc <- pheno\n", + " }\n", + " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n", + " } else {\n", + " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " }\n", " pheno_range <- range(unlist(as.list(as.data.frame(pheno_NAs))), na.rm = TRUE) \n", " if (pheno_range[1] >= 0 && pheno_range[2] <= 1) {\n", " dat <- qnorm(as.matrix(pheno_NAs))\n", @@ -452,7 +470,7 @@ " pheno_imp <- x_imp\n", " }\n", " \n", - " write_delim(cbind(pheno[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\")\n", + " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\")\n", "\n", "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n", " bgzip -f ${_output:n}\n", @@ -488,11 +506,28 @@ " library(\"tibble\")\n", " library(\"readr\")\n", " library(\"dplyr\")\n", - " \n", + " library(\"missForest\")\n", + "\n", " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n", - " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", - " pheno_imp <- missForest(as.matrix(pheno_NAs), parallelize = 'variables')$ximp\n", - " write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n", + " # Get index of features that have > 40% missing \n", + " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n", + " # Get index of features where more than 95% values are zero.\n", + " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n", + " miss.ind <- c(miss.ind, value0.ind)\n", + " # remove these rows if qc_prior_to_impute = True\n", + " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n", + " if (length(miss.ind) > 0) {\n", + " pheno_qc <- pheno[-miss.ind, ]\n", + " } else {\n", + " pheno_qc <- pheno\n", + " }\n", + " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n", + " } else {\n", + " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " }\n", + " pheno_imp <- missForest(as.matrix(pheno_NAs), parallelize = 'no')$ximp\n", + " # If ’variables’ the data is split into pieces of the size equal to the number of cores registered in the parallel backend.\n", + " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n", "\n", "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n", " bgzip -f ${_output:n}\n", @@ -524,15 +559,31 @@ "output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n", "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}' \n", "R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n", - " source('/mnt/vast/hpc/csg/zq2209/xgb_imp.R')\n", + " source('./xgb_imp.R')\n", " library(\"tibble\")\n", " library(\"readr\")\n", " library(\"dplyr\")\n", - " \n", + " library(\"missForest\")\n", + "\n", " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n", - " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " # Get index of features that have > 40% missing \n", + " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n", + " # Get index of features where more than 95% values are zero.\n", + " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n", + " miss.ind <- c(miss.ind, value0.ind)\n", + " # remove these rows if qc_prior_to_impute = True\n", + " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n", + " if (length(miss.ind) > 0) {\n", + " pheno_qc <- pheno[-miss.ind, ]\n", + " } else {\n", + " pheno_qc <- pheno\n", + " }\n", + " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n", + " } else {\n", + " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " }\n", " pheno_imp <- xgboost_imputation(as.matrix(pheno_NAs))\n", - " write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n", + " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n", "\n", "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n", " bgzip -f ${_output:n}\n", @@ -569,9 +620,26 @@ " library(\"dplyr\")\n", " \n", " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n", - " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " # Get index of features that have > 40% missing \n", + " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n", + " # Get index of features where more than 95% values are zero.\n", + " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n", + " miss.ind <- c(miss.ind, value0.ind)\n", + " # remove these rows if qc_prior_to_impute = True\n", + " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n", + " if (length(miss.ind) > 0) {\n", + " pheno_qc <- pheno[-miss.ind, ]\n", + " } else {\n", + " pheno_qc <- pheno\n", + " }\n", + " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n", + " } else {\n", + " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " }\n", + " pheno_NAs = pheno_NAs %>% select(where(~mean(is.na(.)) < 0.8))\n", + " # I removed columns that have more than 80 % missing values, cuz knn cannot impute them. \n", " pheno_imp <- impute.knn(as.matrix(pheno_NAs), rowmax = 1)$data\n", - " write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n", + " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n", "\n", "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n", " bgzip -f ${_output:n}\n", @@ -608,12 +676,27 @@ " library(\"dplyr\")\n", " \n", " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n", - " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " # Get index of features that have > 40% missing \n", + " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n", + " # Get index of features where more than 95% values are zero.\n", + " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n", + " miss.ind <- c(miss.ind, value0.ind)\n", + " # remove these rows if qc_prior_to_impute = True\n", + " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n", + " if (length(miss.ind) > 0) {\n", + " pheno_qc <- pheno[-miss.ind, ]\n", + " } else {\n", + " pheno_qc <- pheno\n", + " }\n", + " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n", + " } else {\n", + " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " }\n", " X_mis_C <- as(as.matrix(pheno_NAs), \"Incomplete\")\n", " ###uses \"svd\" algorithm\n", " fit <- softImpute(X_mis_C,rank = 50,lambda = 30,type = \"svd\")\n", " pheno_imp <- complete(as.matrix(pheno_NAs),fit)\n", - " write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n", + " write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n", "\n", "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n", " bgzip -f ${_output:n}\n", @@ -649,12 +732,27 @@ " library(\"dplyr\")\n", " \n", " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n", - " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " # Get index of features that have > 40% missing \n", + " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n", + " # Get index of features where more than 95% values are zero.\n", + " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n", + " miss.ind <- c(miss.ind, value0.ind)\n", + " # remove these rows if qc_prior_to_impute = True\n", + " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n", + " if (length(miss.ind) > 0) {\n", + " pheno_qc <- pheno[-miss.ind, ]\n", + " } else {\n", + " pheno_qc <- pheno\n", + " }\n", + " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n", + " } else {\n", + " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " }\n", " Yfill <- as.matrix(pheno_NAs)\n", " for (t.row in 1:nrow(pheno_NAs)) {\n", " Yfill[t.row, is.na(Yfill[t.row,])] <- rowMeans(Yfill, na.rm = TRUE)[t.row] \n", " } \n", - " write_delim(cbind(pheno[, 1:4], Yfill), ${_output:r}, delim = \"\\t\" )\n", + " write_delim(cbind(pheno_qc[, 1:4], Yfill), \"${_output:n}\", delim = \"\\t\" )\n", "\n", "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n", " bgzip -f ${_output:n}\n", @@ -690,12 +788,27 @@ " library(\"dplyr\")\n", " \n", " pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n", - " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " # Get index of features that have > 40% missing \n", + " miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n", + " # Get index of features where more than 95% values are zero.\n", + " value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n", + " miss.ind <- c(miss.ind, value0.ind)\n", + " # remove these rows if qc_prior_to_impute = True\n", + " if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n", + " if (length(miss.ind) > 0) {\n", + " pheno_qc <- pheno[-miss.ind, ]\n", + " } else {\n", + " pheno_qc <- pheno\n", + " }\n", + " pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n", + " } else {\n", + " pheno_NAs <- pheno[, 5:ncol(pheno)]\n", + " }\n", " Yfill <- as.matrix(pheno_NAs)\n", " for (t.row in 1:nrow(pheno_NAs)) {\n", " Yfill[t.row, is.na(Yfill[t.row,])] <- min(Yfill[t.row, ], na.rm = TRUE)\n", " }\n", - " write_delim(cbind(pheno[, 1:4], Yfill), ${_output:r}, delim = \"\\t\" )\n", + " write_delim(cbind(pheno_qc[, 1:4], Yfill), \"${_output:n}\", delim = \"\\t\" )\n", "\n", "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n", " bgzip -f ${_output:n}\n", @@ -732,7 +845,7 @@ "# with missing rate smaller than tol_missing will be mean_imputed. Say if we want to keep rows with less than 5% missing, then we use 0.05 as tol_missing.\n", "parameter: tol_missing = 0.05\n", "input: phenoFile\n", - "output: f'{_input:nn}.filter_na.{impute_method}.bed.gz'\n", + "output: f'{cwd:a}/{_input:nn}.filter_na.{impute_method}.bed.gz'\n", "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads\n", "R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n", " library(\"dplyr\")\n", diff --git a/code/data_preprocessing/phenotype/xgb_imp.R b/code/data_preprocessing/phenotype/xgb_imp.R new file mode 100644 index 000000000..57dc1be98 --- /dev/null +++ b/code/data_preprocessing/phenotype/xgb_imp.R @@ -0,0 +1,183 @@ +#options(repos = c(CRAN = "https://cloud.r-project.org/")) +#install.packages("xgboost") +library(xgboost) +#library(caret) +#library(tidyverse) +library(doParallel) +library(doRNG) +library(itertools) +numCores <- parallel::detectCores() +#cl <- makeCluster(5) +registerDoParallel(numCores - 1) + +#source('./SimulationDesign1/constants.R') +#source('./SimulationDesign1/simulateData.R') +#Y_sim <- simulateData()$Y +#Y_mis <- generate_missing(Y_sim) + +xgboost_imputation <- function(data, maxiter = 10, verbose = TRUE, max.depth = 2, nrounds = 50, decreasing = FALSE, parallelize = 'variables') { + xmis <- data + #xmis <- Y_mis + n <- nrow(xmis) + p <- ncol(xmis) + + # Remove complete missing columns + if (any(apply(is.na(xmis), 2, sum) == n)){ + indCmis <- which(apply(is.na(xmis), 2, sum) == n) + xmis <- xmis[,-indCmis] + p <- ncol(xmis) + cat('removed variable(s)', indCmis, + 'due to the missingness of all entries\n') + } + + # Initial mean imputation + ximp <- xmis + varType <- character(p) + for (t.co in 1:p) { + ximp[is.na(xmis[,t.co]),t.co] <- colMeans(xmis, na.rm = TRUE)[t.co] #mean(xmis[,1], na.rm = TRUE) + varType[t.co] <- 'numeric' + #if (is.numeric(xmis[[t.co]])) { + # varType[t.co] <- 'numeric' + #ximp[is.na(xmis[,t.co]),t.co] <- mean(xmis[,t.co], na.rm = TRUE) + #next() + #} + } + + # Extract missing location + NAloc <- is.na(xmis) # where are missings + noNAvar <- apply(NAloc, 2, sum) # how many are missing in the vars + sort.j <- order(noNAvar) # indices of increasing amount of NA in vars + #decreasing = FALSE + if (decreasing) + sort.j <- rev(sort.j) + sort.noNAvar <- noNAvar[sort.j] + + # Extract the columns to be imputed + nzsort.j <- sort.j[sort.noNAvar > 0] + #parallelize <- 'variables' + if (parallelize == 'variables') { + '%cols%' <- get('%dorng%') + idxList <- as.list(isplitVector(nzsort.j, chunkSize = getDoParWorkers())) + } + # print("idxList") + # print(idxList) + + ## initialize parameters of interest + iter <- 0 + k <- length(unique(varType)) + convNew <- rep(0, k) + names(convNew) <- c('numeric') + convOld <- rep(Inf, k) + convergence <- c() + #OOBerror <- numeric(p) + #names(OOBerror) <- varType + + #maxiter <- 10 + Ximp <- vector('list', maxiter) + + # Stop criteria + stopCriterion <- function(varType, convNew, convOld, iter, maxiter){ + k <- length(unique(varType)) + if (k == 1){ + (convNew < convOld) & (iter < maxiter) + } else { + ((convNew[1] < convOld[1]) | (convNew[2] < convOld[2])) & (iter < maxiter) + } + } + + # Main Imputation Function + while (stopCriterion(varType, convNew, convOld, iter, maxiter)) { + if (iter != 0) { + convOld <- convNew + # OOBerrOld <- OOBerr + } + if (verbose) { + cat(" XGBoost iteration", iter+1, "in progress...") + } + t.start <- proc.time() + ximp.old <- ximp + + for (idx in idxList) { + results <- foreach(varInd = idx, .packages = 'xgboost') %cols% { + obsi <- !NAloc[, varInd] # which i's are observed + misi <- NAloc[, varInd] # which i's are missing + print(paste("Dimensions of obsi:", dim(obsi))) + print(paste("Dimensions of misi:", dim(misi))) + obsY <- ximp[obsi, varInd, drop = FALSE] # training response + misY <- ximp[misi, varInd, drop = FALSE] # testing response + obsX <- ximp[obsi, seq(1, p)[-varInd], drop = FALSE] # training variables + misX <- ximp[misi, seq(1, p)[-varInd], drop = FALSE] # prediction variables + print(paste("Dimensions of obsX:", dim(obsX))) + print(paste("Dimensions of misX:", dim(misX))) + print(paste("Dimensions of misY:", dim(misY))) + print(paste("Dimensions of obsY:", dim(obsY))) + # print("obsX:") + # dim(obsX) + # # print(head(obsX)) + # print("obsY:") + # dim(obsY) + # print(head(obsY)) + # print("misX:") + # print(head(misX)) + # print("misY:") + # print(head(misY)) + # xgb_obsX = xgb.DMatrix(data = as.matrix(obsX), label = as.matrix(obsY)) # xgboost version of train + xgb_obsX <- tryCatch({ + xgb.DMatrix(data = as.matrix(obsX), label = as.matrix(obsY)) + }, error = function(e) { + print(paste("Error in creating xgb_obsX:", e$message)) + print(paste("Dimensions of obsX:", dim(obsX))) + print(paste("Dimensions of obsY:", dim(obsY))) + NULL + }) + xgb_misX = xgb.DMatrix(data = as.matrix(misX), label = as.matrix(misY))# misX contains the rows where the current variable (varInd) is missing. # misY also contains these missing values, which you're trying to predict. XGBoost doesn't expect labels for the data you're trying to predict. + if (inherits(xgb_misX, "try-error")) { + print("Error in creating xgb_misX") + print(xgb_misX) + } + #typeY <- varType[varInd] + print("xgb_obs:") + # print(head(xgb_obsX)) + # print("xgb_mis:") + # print(head(xgb_misX)) + # xgboost on train + xgb <- xgboost(data = xgb_obsX, max.depth = max.depth, nrounds = nrounds) + + misY <- predict(xgb, xgb_misX) ## predict missing values in column varInd + + list(varInd = varInd, misY = misY) #oerr = oerr + } + + for (res in results) { + misi <- NAloc[,res$varInd] + ximp[misi, res$varInd] <- res$misY + #OOBerror[res$varInd] <- res$oerr + } + } + + if (verbose){ + cat('done!\n') + } + + iter <- iter + 1 + Ximp[[iter]] <- ximp + + t.co2 <- 1 + ## check the difference between iteration steps + for (t.type in names(convNew)){ + t.ind <- which(varType == t.type) + convNew[t.co2] <- sum((ximp[, t.ind] - ximp.old[, t.ind])^2) / sum(ximp[, t.ind]^2) + t.co2 <- t.co2 + 1 + } + } + + # Extrtact the output + if (iter == maxiter) { + out <- Ximp[[iter]] + } else { + out <- Ximp[[iter - 1]] + } + + return(out) +} + From 8f7d25001ccfffb79724cc27fec9fe66875fdaab Mon Sep 17 00:00:00 2001 From: rl3328 Date: Mon, 30 Sep 2024 23:11:31 -0400 Subject: [PATCH 3/4] Update containers.csv --- container/containers.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/container/containers.csv b/container/containers.csv index 3ec20add3..99abd014c 100644 --- a/container/containers.csv +++ b/container/containers.csv @@ -73,7 +73,7 @@ trimmomatic,https://github.com/timflutre/trimmomatic,Java,0.39,N/A,bioconda,rna_ gtex-pipeline-qtl,https://github.com/broadinstitute/gtex-pipeline/tree/master/qtl,Python,gtex_v8,bed31e12a48b25db8b111252e2136b31c64c9e82,dnachun,rna_quantification,Other,https://anaconda.org/dnachun/gtex-pipeline-qtl, pandas,https://pypi.org/project/pandas/,Python,1.3.5,N/A,conda-forge,"rna_quantification, tensorqtl, quantqtl",BSD-3-Clause,https://anaconda.org/conda-forge/pandas, r-mvsusier,https://github.com/stephenslab/mvsusieR,R,0.1.7,N/A,dnachun,pecotmr,GPL-3.0-or-later,https://anaconda.org/dnachun/r-mvsusier, -r-pecotmr,https://github.com/cumc/pecotmr,R,0.1.70,N/A,dnachun,pecotmr,GPL-3.0-or-later,https://anaconda.org/dnachun/r-pecotmr, +r-pecotmr,https://github.com/cumc/pecotmr,R,0.1.71,N/A,dnachun,pecotmr,GPL-3.0-or-later,https://anaconda.org/dnachun/r-pecotmr, r-fsusier,https://github.com/stephenslab/fsusier,R,0.2.0,dd4df3668d2ca59dd34a52b6c1b426054a998fbd,dnachun,pecotmr,GPL-3.0-or-later,https://anaconda.org/dnachun/r-fsusier, backports.zoneinfo,https://pypi.org/project/backports.zoneinfo/,Python,0.2.1,N/A,conda-forge,"pecotmr, quantqtl",Apache-2.0,https://anaconda.org/conda-forge/backports.zoneinfo, bioconductor-variantannotation,https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html,R,1.48.1,N/A,bioconda,"pecotmr, quantqtl",Artistic-2.0,https://anaconda.org/bioconda/bioconductor-variantannotation, @@ -134,4 +134,4 @@ bioconductor-edger,https://bioconductor.org/packages/3.18/bioc/html/edgeR.html,R bioconductor-limma,https://bioconductor.org/packages/3.18/bioc/html/limma.html,R,3.58.1,N/A,bioconda,seurat,GPL-2.0-or-later,https://anaconda.org/bioconda/bioconductor-limma, r-profvis,https://rstudio.github.io/profvis/,R,,N/A,conda-forge,quantqtl,GPL-3.0-or-later,https://anaconda.org/conda-forge/r-profvis, r-bigsnpr,https://privefl.github.io/bigsnpr/,R,1.12.2,N/A,conda-forge,pecotmr,GPL-3.0-or-later,https://anaconda.org/conda-forge/r-bigsnpr, -quantas,https://github.com/chaolinzhanglab/quantas,Perl,1.1.2,N/A,dnachun,quantas,Other,https://anaconda.org/dnachun/quantas, \ No newline at end of file +quantas,https://github.com/chaolinzhanglab/quantas,Perl,1.1.2,N/A,dnachun,quantas,Other,https://anaconda.org/dnachun/quantas, From 657b110aa74bbabf9291eeb9236e58ac5e79096b Mon Sep 17 00:00:00 2001 From: rl3328 Date: Tue, 1 Oct 2024 03:12:01 +0000 Subject: [PATCH 4/4] Update YAML files and Markdown table --- container/README.md | 2 +- container/pecotmr/pecotmr.yml | 2 +- container/readme/containers.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/container/README.md b/container/README.md index 452b022f1..f6549fa90 100644 --- a/container/README.md +++ b/container/README.md @@ -99,7 +99,7 @@ On occasion, you may see errors that a download or upload has failed while build | r-bigsnpr | [Link](https://privefl.github.io/bigsnpr/) | [Link](https://anaconda.org/conda-forge/r-bigsnpr) | pecotmr | 1.12.2 | conda‑forge | GPL‑3.0‑or‑later | | r-fsusier | [Link](https://github.com/stephenslab/fsusier) | [Link](https://anaconda.org/dnachun/r-fsusier) | pecotmr | 0.2.0 | dnachun | GPL‑3.0‑or‑later | | r-mvsusier | [Link](https://github.com/stephenslab/mvsusieR) | [Link](https://anaconda.org/dnachun/r-mvsusier) | pecotmr | 0.1.7 | dnachun | GPL‑3.0‑or‑later | -| r-pecotmr | [Link](https://github.com/cumc/pecotmr) | [Link](https://anaconda.org/dnachun/r-pecotmr) | pecotmr | 0.1.70 | dnachun | GPL‑3.0‑or‑later | +| r-pecotmr | [Link](https://github.com/cumc/pecotmr) | [Link](https://anaconda.org/dnachun/r-pecotmr) | pecotmr | 0.1.71 | dnachun | GPL‑3.0‑or‑later | | r-susier | [Link](https://github.com/stephenslab/susieR) | [Link](https://anaconda.org/conda-forge/r-susier) | pecotmr, fastenloc | 0.14.7 | conda‑forge | MIT | | backports.zoneinfo | [Link](https://pypi.org/project/backports.zoneinfo/) | [Link](https://anaconda.org/conda-forge/backports.zoneinfo) | pecotmr, quantqtl | 0.2.1 | conda‑forge | Apache‑2.0 | | bioconductor-variantannotation | [Link](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) | [Link](https://anaconda.org/bioconda/bioconductor-variantannotation) | pecotmr, quantqtl | 1.48.1 | bioconda | Artistic‑2.0 | diff --git a/container/pecotmr/pecotmr.yml b/container/pecotmr/pecotmr.yml index e44262dd1..cca0ce6bf 100644 --- a/container/pecotmr/pecotmr.yml +++ b/container/pecotmr/pecotmr.yml @@ -16,7 +16,7 @@ dependencies: - r-genio=1.1.2 - r-gwasrapidd=0.99.14 - r-mvsusier=0.1.7 - - r-pecotmr=0.1.70 + - r-pecotmr=0.1.71 - r-susier=0.14.7 - r-udr=0.3.154 - altair diff --git a/container/readme/containers.md b/container/readme/containers.md index 2f984bffb..625af59c2 100644 --- a/container/readme/containers.md +++ b/container/readme/containers.md @@ -48,7 +48,7 @@ | r-bigsnpr | [Link](https://privefl.github.io/bigsnpr/) | [Link](https://anaconda.org/conda-forge/r-bigsnpr) | pecotmr | 1.12.2 | conda‑forge | GPL‑3.0‑or‑later | | r-fsusier | [Link](https://github.com/stephenslab/fsusier) | [Link](https://anaconda.org/dnachun/r-fsusier) | pecotmr | 0.2.0 | dnachun | GPL‑3.0‑or‑later | | r-mvsusier | [Link](https://github.com/stephenslab/mvsusieR) | [Link](https://anaconda.org/dnachun/r-mvsusier) | pecotmr | 0.1.7 | dnachun | GPL‑3.0‑or‑later | -| r-pecotmr | [Link](https://github.com/cumc/pecotmr) | [Link](https://anaconda.org/dnachun/r-pecotmr) | pecotmr | 0.1.70 | dnachun | GPL‑3.0‑or‑later | +| r-pecotmr | [Link](https://github.com/cumc/pecotmr) | [Link](https://anaconda.org/dnachun/r-pecotmr) | pecotmr | 0.1.71 | dnachun | GPL‑3.0‑or‑later | | r-susier | [Link](https://github.com/stephenslab/susieR) | [Link](https://anaconda.org/conda-forge/r-susier) | pecotmr, fastenloc | 0.14.7 | conda‑forge | MIT | | backports.zoneinfo | [Link](https://pypi.org/project/backports.zoneinfo/) | [Link](https://anaconda.org/conda-forge/backports.zoneinfo) | pecotmr, quantqtl | 0.2.1 | conda‑forge | Apache‑2.0 | | bioconductor-variantannotation | [Link](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) | [Link](https://anaconda.org/bioconda/bioconductor-variantannotation) | pecotmr, quantqtl | 1.48.1 | bioconda | Artistic‑2.0 |