From 471b02ed76ac843247b80d4feba906b04ceebf49 Mon Sep 17 00:00:00 2001
From: rl3328 <rl3328@cumc.columbia.edu>
Date: Mon, 30 Sep 2024 21:35:49 +0000
Subject: [PATCH 1/4] update QC for all steps except for gEBMF

---
 ...rmated.rename_event.bed.imputed.bed.stderr | 18 ++++++++++++++
 ...rmated.rename_event.bed.imputed.bed.stdout |  0
 ...rename_event.bed_subset.imputed.bed.stderr | 24 +++++++++++++++++++
 ...rename_event.bed_subset.imputed.bed.stdout |  5 ++++
 4 files changed, 47 insertions(+)
 create mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr
 create mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout
 create mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr
 create mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout

diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr
new file mode 100644
index 000000000..f2d98f826
--- /dev/null
+++ b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr
@@ -0,0 +1,18 @@
+Loading required package: ebnm
+Loading required package: magrittr
+
+Attaching package: ‘dplyr’
+
+The following objects are masked from ‘package:stats’:
+
+    filter, lag
+
+The following objects are masked from ‘package:base’:
+
+    intersect, setdiff, setequal, union
+
+Loading required package: Matrix
+Loaded softImpute 1.4-1
+
+
+Execution halted
diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout
new file mode 100644
index 000000000..e69de29bb
diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr
new file mode 100644
index 000000000..b0f41d78a
--- /dev/null
+++ b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr
@@ -0,0 +1,24 @@
+Loading required package: ebnm
+Loading required package: magrittr
+
+Attaching package: ‘dplyr’
+
+The following objects are masked from ‘package:stats’:
+
+    filter, lag
+
+The following objects are masked from ‘package:base’:
+
+    intersect, setdiff, setequal, union
+
+Loading required package: Matrix
+Loaded softImpute 1.4-1
+
+Rows: 9999 Columns: 1145
+── Column specification ────────────────────────────────────────────────────────
+Delimiter: "\t"
+chr    (2): #chr, ID
+dbl (1143): start, end, 01_120405, 02_120405, 03_120405, 04_120405, 05_12040...
+
+ℹ Use `spec()` to retrieve the full column specification for this data.
+ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout
new file mode 100644
index 000000000..e3a28b284
--- /dev/null
+++ b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout
@@ -0,0 +1,5 @@
+Backfitting 60 factors (tolerance: 6.55e-02)...
+  Difference between iterations is within 1.0e+04...
+  Difference between iterations is within 1.0e+03...
+  Difference between iterations is within 1.0e+02...
+  Difference between iterations is within 1.0e+01...

From e200dabba781f70582bbf2b6811934bf63b11fe4 Mon Sep 17 00:00:00 2001
From: rl3328 <rl3328@cumc.columbia.edu>
Date: Mon, 30 Sep 2024 23:00:21 +0000
Subject: [PATCH 2/4] gEBMF not resolved

---
 ...rmated.rename_event.bed.imputed.bed.stderr |  18 --
 ...rmated.rename_event.bed.imputed.bed.stdout |   0
 ...rename_event.bed_subset.imputed.bed.stderr |  24 ---
 ...rename_event.bed_subset.imputed.bed.stdout |   5 -
 .../phenotype/phenotype_imputation.ipynb      | 159 ++++++++++++---
 code/data_preprocessing/phenotype/xgb_imp.R   | 183 ++++++++++++++++++
 6 files changed, 319 insertions(+), 70 deletions(-)
 delete mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr
 delete mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout
 delete mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr
 delete mode 100644 code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout
 create mode 100644 code/data_preprocessing/phenotype/xgb_imp.R

diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr
deleted file mode 100644
index f2d98f826..000000000
--- a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stderr
+++ /dev/null
@@ -1,18 +0,0 @@
-Loading required package: ebnm
-Loading required package: magrittr
-
-Attaching package: ‘dplyr’
-
-The following objects are masked from ‘package:stats’:
-
-    filter, lag
-
-The following objects are masked from ‘package:base’:
-
-    intersect, setdiff, setequal, union
-
-Loading required package: Matrix
-Loaded softImpute 1.4-1
-
-
-Execution halted
diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed.imputed.bed.stdout
deleted file mode 100644
index e69de29bb..000000000
diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr
deleted file mode 100644
index b0f41d78a..000000000
--- a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stderr
+++ /dev/null
@@ -1,24 +0,0 @@
-Loading required package: ebnm
-Loading required package: magrittr
-
-Attaching package: ‘dplyr’
-
-The following objects are masked from ‘package:stats’:
-
-    filter, lag
-
-The following objects are masked from ‘package:base’:
-
-    intersect, setdiff, setequal, union
-
-Loading required package: Matrix
-Loaded softImpute 1.4-1
-
-Rows: 9999 Columns: 1145
-── Column specification ────────────────────────────────────────────────────────
-Delimiter: "\t"
-chr    (2): #chr, ID
-dbl (1143): start, end, 01_120405, 02_120405, 03_120405, 04_120405, 05_12040...
-
-ℹ Use `spec()` to retrieve the full column specification for this data.
-ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diff --git a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout b/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout
deleted file mode 100644
index e3a28b284..000000000
--- a/code/data_preprocessing/phenotype/output/DLPFC_combined_SSU.formated.rename_event.bed_subset.imputed.bed.stdout
+++ /dev/null
@@ -1,5 +0,0 @@
-Backfitting 60 factors (tolerance: 6.55e-02)...
-  Difference between iterations is within 1.0e+04...
-  Difference between iterations is within 1.0e+03...
-  Difference between iterations is within 1.0e+02...
-  Difference between iterations is within 1.0e+01...
diff --git a/code/data_preprocessing/phenotype/phenotype_imputation.ipynb b/code/data_preprocessing/phenotype/phenotype_imputation.ipynb
index e1d7f1a57..8aded6ce1 100644
--- a/code/data_preprocessing/phenotype/phenotype_imputation.ipynb
+++ b/code/data_preprocessing/phenotype/phenotype_imputation.ipynb
@@ -178,7 +178,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "9073376b-b7e5-46d6-ae70-54229fa72453",
    "metadata": {
     "kernel": "SoS"
@@ -211,6 +211,8 @@
       "                        Work directory & output directory\n",
       "  --phenoFile VAL (as path, required)\n",
       "                        Molecular phenotype matrix\n",
+      "  --[no-]qc-prior-to-impute (default to True)\n",
+      "                        QC before imputation to remove unqualified features\n",
       "  --job-size 1 (as int)\n",
       "                        For cluster jobs, number commands to run per job\n",
       "  --walltime 72h\n",
@@ -285,6 +287,8 @@
     "parameter: cwd = path(\"output\")\n",
     "# Molecular phenotype matrix\n",
     "parameter: phenoFile = path\n",
+    "# QC before imputation to remove unqualified features \n",
+    "parameter: qc_prior_to_impute = True\n",
     "# For cluster jobs, number commands to run per job\n",
     "parameter: job_size = 1\n",
     "# Wall clock time expected\n",
@@ -312,7 +316,6 @@
     "parameter: prior = \"ebnm_point_laplace\"\n",
     "parameter: varType = '1'\n",
     "parameter: num_factor = '60'\n",
-    "parameter: qc_prior_to_impute = True\n",
     "input: phenoFile\n",
     "output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}', cores = numThreads  \n",
@@ -327,8 +330,8 @@
     "    pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
     "    # Get index of features that have > 40% missing \n",
     "    miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
-    "    # Get index of features where all values are zero.\n",
-    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)]))) == (ncol(pheno) -4))\n",
+    "    # Get index of features where more than 95% values are zero.\n",
+    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
     "    miss.ind <- c(miss.ind, value0.ind)\n",
     "    # remove these rows if qc_prior_to_impute = True\n",
     "    if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
@@ -417,7 +420,22 @@
     "   library(\"softImpute\")\n",
     "  \n",
     "    pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
-    "    pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    # Get index of features that have > 40% missing \n",
+    "    miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+    "    # Get index of features where more than 95% values are zero.\n",
+    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+    "    miss.ind <- c(miss.ind, value0.ind)\n",
+    "    # remove these rows if qc_prior_to_impute = True\n",
+    "    if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+    "      if (length(miss.ind) > 0) {\n",
+    "         pheno_qc <- pheno[-miss.ind, ]\n",
+    "       } else {\n",
+    "         pheno_qc <- pheno\n",
+    "       }\n",
+    "      pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+    "    } else {\n",
+    "      pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    }\n",
     "    pheno_range <- range(unlist(as.list(as.data.frame(pheno_NAs))), na.rm = TRUE) \n",
     "    if (pheno_range[1] >= 0 && pheno_range[2] <= 1) {\n",
     "        dat <- qnorm(as.matrix(pheno_NAs))\n",
@@ -452,7 +470,7 @@
     "        pheno_imp <- x_imp\n",
     "    }\n",
     "  \n",
-    "    write_delim(cbind(pheno[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\")\n",
+    "    write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\")\n",
     "\n",
     "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
     "    bgzip -f ${_output:n}\n",
@@ -488,11 +506,28 @@
     "   library(\"tibble\")\n",
     "   library(\"readr\")\n",
     "   library(\"dplyr\")\n",
-    "    \n",
+    "   library(\"missForest\")\n",
+    "\n",
     "    pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
-    "    pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
-    "    pheno_imp <- missForest(as.matrix(pheno_NAs), parallelize = 'variables')$ximp\n",
-    "    write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
+    "    # Get index of features that have > 40% missing \n",
+    "    miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+    "    # Get index of features where more than 95% values are zero.\n",
+    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+    "    miss.ind <- c(miss.ind, value0.ind)\n",
+    "    # remove these rows if qc_prior_to_impute = True\n",
+    "    if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+    "      if (length(miss.ind) > 0) {\n",
+    "         pheno_qc <- pheno[-miss.ind, ]\n",
+    "       } else {\n",
+    "         pheno_qc <- pheno\n",
+    "       }\n",
+    "      pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+    "    } else {\n",
+    "      pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    }\n",
+    "    pheno_imp <- missForest(as.matrix(pheno_NAs), parallelize = 'no')$ximp\n",
+    "    # If ’variables’ the data is split into pieces of the size equal to the number of cores registered in the parallel backend.\n",
+    "    write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
     "\n",
     "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
     "    bgzip -f ${_output:n}\n",
@@ -524,15 +559,31 @@
     "output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}'  \n",
     "R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n",
-    "   source('/mnt/vast/hpc/csg/zq2209/xgb_imp.R')\n",
+    "   source('./xgb_imp.R')\n",
     "   library(\"tibble\")\n",
     "   library(\"readr\")\n",
     "   library(\"dplyr\")\n",
-    "    \n",
+    "   library(\"missForest\")\n",
+    "\n",
     "    pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
-    "    pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    # Get index of features that have > 40% missing \n",
+    "    miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+    "    # Get index of features where more than 95% values are zero.\n",
+    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+    "    miss.ind <- c(miss.ind, value0.ind)\n",
+    "    # remove these rows if qc_prior_to_impute = True\n",
+    "    if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+    "      if (length(miss.ind) > 0) {\n",
+    "         pheno_qc <- pheno[-miss.ind, ]\n",
+    "       } else {\n",
+    "         pheno_qc <- pheno\n",
+    "       }\n",
+    "      pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+    "    } else {\n",
+    "      pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    }\n",
     "    pheno_imp <- xgboost_imputation(as.matrix(pheno_NAs))\n",
-    "    write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
+    "    write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
     "\n",
     "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
     "    bgzip -f ${_output:n}\n",
@@ -569,9 +620,26 @@
     "   library(\"dplyr\")\n",
     "    \n",
     "    pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
-    "    pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    # Get index of features that have > 40% missing \n",
+    "    miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+    "    # Get index of features where more than 95% values are zero.\n",
+    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+    "    miss.ind <- c(miss.ind, value0.ind)\n",
+    "    # remove these rows if qc_prior_to_impute = True\n",
+    "    if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+    "      if (length(miss.ind) > 0) {\n",
+    "         pheno_qc <- pheno[-miss.ind, ]\n",
+    "       } else {\n",
+    "         pheno_qc <- pheno\n",
+    "       }\n",
+    "      pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+    "    } else {\n",
+    "      pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    }\n",
+    "    pheno_NAs = pheno_NAs %>% select(where(~mean(is.na(.)) < 0.8))\n",
+    "    # I removed columns that have more than 80 % missing values, cuz knn cannot impute them. \n",
     "    pheno_imp <- impute.knn(as.matrix(pheno_NAs), rowmax = 1)$data\n",
-    "    write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
+    "    write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
     "\n",
     "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
     "    bgzip -f ${_output:n}\n",
@@ -608,12 +676,27 @@
     "   library(\"dplyr\")\n",
     "    \n",
     "    pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
-    "    pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    # Get index of features that have > 40% missing \n",
+    "    miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+    "    # Get index of features where more than 95% values are zero.\n",
+    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+    "    miss.ind <- c(miss.ind, value0.ind)\n",
+    "    # remove these rows if qc_prior_to_impute = True\n",
+    "    if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+    "      if (length(miss.ind) > 0) {\n",
+    "         pheno_qc <- pheno[-miss.ind, ]\n",
+    "       } else {\n",
+    "         pheno_qc <- pheno\n",
+    "       }\n",
+    "      pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+    "    } else {\n",
+    "      pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    }\n",
     "    X_mis_C <- as(as.matrix(pheno_NAs), \"Incomplete\")\n",
     "    ###uses \"svd\" algorithm\n",
     "    fit <- softImpute(X_mis_C,rank = 50,lambda = 30,type = \"svd\")\n",
     "    pheno_imp <- complete(as.matrix(pheno_NAs),fit)\n",
-    "    write_delim(cbind(pheno[, 1:4], pheno_imp), ${_output:r}, delim = \"\\t\" )\n",
+    "    write_delim(cbind(pheno_qc[, 1:4], pheno_imp), \"${_output:n}\", delim = \"\\t\" )\n",
     "\n",
     "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
     "    bgzip -f ${_output:n}\n",
@@ -649,12 +732,27 @@
     "   library(\"dplyr\")\n",
     "    \n",
     "    pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
-    "    pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    # Get index of features that have > 40% missing \n",
+    "    miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+    "    # Get index of features where more than 95% values are zero.\n",
+    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+    "    miss.ind <- c(miss.ind, value0.ind)\n",
+    "    # remove these rows if qc_prior_to_impute = True\n",
+    "    if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+    "      if (length(miss.ind) > 0) {\n",
+    "         pheno_qc <- pheno[-miss.ind, ]\n",
+    "       } else {\n",
+    "         pheno_qc <- pheno\n",
+    "       }\n",
+    "      pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+    "    } else {\n",
+    "      pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    }\n",
     "    Yfill <- as.matrix(pheno_NAs)\n",
     "    for (t.row in 1:nrow(pheno_NAs)) {\n",
     "        Yfill[t.row, is.na(Yfill[t.row,])] <- rowMeans(Yfill, na.rm = TRUE)[t.row] \n",
     "    }    \n",
-    "    write_delim(cbind(pheno[, 1:4], Yfill), ${_output:r}, delim = \"\\t\" )\n",
+    "    write_delim(cbind(pheno_qc[, 1:4], Yfill), \"${_output:n}\", delim = \"\\t\" )\n",
     "\n",
     "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
     "    bgzip -f ${_output:n}\n",
@@ -690,12 +788,27 @@
     "   library(\"dplyr\")\n",
     "    \n",
     "    pheno <- read_delim(${_input:ar}, delim = \"\\t\")\n",
-    "    pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    # Get index of features that have > 40% missing \n",
+    "    miss.ind <- which(rowSums(is.na(pheno[, 5:ncol(pheno)]))/(ncol(pheno) -4) > 0.4) \n",
+    "    # Get index of features where more than 95% values are zero.\n",
+    "    value0.ind <- which(rowSums((pheno[, 5:ncol(pheno)] == 0 | is.na(pheno[, 5:ncol(pheno)])))/(ncol(pheno) -4) > 0.95)\n",
+    "    miss.ind <- c(miss.ind, value0.ind)\n",
+    "    # remove these rows if qc_prior_to_impute = True\n",
+    "    if (${\"TRUE\" if qc_prior_to_impute else \"FALSE\"}) {\n",
+    "      if (length(miss.ind) > 0) {\n",
+    "         pheno_qc <- pheno[-miss.ind, ]\n",
+    "       } else {\n",
+    "         pheno_qc <- pheno\n",
+    "       }\n",
+    "      pheno_NAs <- pheno_qc[, 5:ncol(pheno_qc)]\n",
+    "    } else {\n",
+    "      pheno_NAs <- pheno[, 5:ncol(pheno)]\n",
+    "    }\n",
     "    Yfill <- as.matrix(pheno_NAs)\n",
     "    for (t.row in 1:nrow(pheno_NAs)) {\n",
     "        Yfill[t.row, is.na(Yfill[t.row,])] <- min(Yfill[t.row, ], na.rm = TRUE)\n",
     "    }\n",
-    "    write_delim(cbind(pheno[, 1:4], Yfill), ${_output:r}, delim = \"\\t\" )\n",
+    "    write_delim(cbind(pheno_qc[, 1:4], Yfill), \"${_output:n}\", delim = \"\\t\" )\n",
     "\n",
     "bash: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
     "    bgzip -f ${_output:n}\n",
@@ -732,7 +845,7 @@
     "# with missing rate smaller than tol_missing will be mean_imputed. Say if we want to keep rows with less than 5% missing, then we use 0.05 as tol_missing.\n",
     "parameter: tol_missing = 0.05\n",
     "input: phenoFile\n",
-    "output: f'{_input:nn}.filter_na.{impute_method}.bed.gz'\n",
+    "output: f'{cwd:a}/{_input:nn}.filter_na.{impute_method}.bed.gz'\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads\n",
     "R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container=container, entrypoint=entrypoint\n",
     "   library(\"dplyr\")\n",
diff --git a/code/data_preprocessing/phenotype/xgb_imp.R b/code/data_preprocessing/phenotype/xgb_imp.R
new file mode 100644
index 000000000..57dc1be98
--- /dev/null
+++ b/code/data_preprocessing/phenotype/xgb_imp.R
@@ -0,0 +1,183 @@
+#options(repos = c(CRAN = "https://cloud.r-project.org/"))
+#install.packages("xgboost")
+library(xgboost)
+#library(caret)
+#library(tidyverse)
+library(doParallel)
+library(doRNG)
+library(itertools)
+numCores <- parallel::detectCores()
+#cl <- makeCluster(5)
+registerDoParallel(numCores - 1)
+
+#source('./SimulationDesign1/constants.R')
+#source('./SimulationDesign1/simulateData.R')
+#Y_sim <- simulateData()$Y
+#Y_mis <- generate_missing(Y_sim)
+
+xgboost_imputation <- function(data, maxiter = 10, verbose = TRUE, max.depth = 2, nrounds = 50, decreasing = FALSE, parallelize = 'variables') {
+  xmis <- data
+  #xmis <- Y_mis
+  n <- nrow(xmis)
+  p <- ncol(xmis)
+  
+  # Remove complete missing columns
+  if (any(apply(is.na(xmis), 2, sum) == n)){
+    indCmis <- which(apply(is.na(xmis), 2, sum) == n)
+    xmis <- xmis[,-indCmis]
+    p <- ncol(xmis)
+    cat('removed variable(s)', indCmis,
+        'due to the missingness of all entries\n')
+  } 
+  
+  # Initial mean imputation
+  ximp <- xmis
+  varType <- character(p)
+  for (t.co in 1:p) {
+    ximp[is.na(xmis[,t.co]),t.co] <- colMeans(xmis, na.rm = TRUE)[t.co] #mean(xmis[,1], na.rm = TRUE)
+    varType[t.co] <- 'numeric'
+    #if (is.numeric(xmis[[t.co]])) {
+    # varType[t.co] <- 'numeric'
+    #ximp[is.na(xmis[,t.co]),t.co] <- mean(xmis[,t.co], na.rm = TRUE)
+    #next()
+    #}
+  }
+  
+  # Extract missing location
+  NAloc <- is.na(xmis)            # where are missings
+  noNAvar <- apply(NAloc, 2, sum) # how many are missing in the vars
+  sort.j <- order(noNAvar)        # indices of increasing amount of NA in vars
+  #decreasing = FALSE
+  if (decreasing)
+    sort.j <- rev(sort.j)
+  sort.noNAvar <- noNAvar[sort.j]
+  
+  # Extract the columns to be imputed 
+  nzsort.j <- sort.j[sort.noNAvar > 0]
+  #parallelize <- 'variables'
+  if (parallelize == 'variables') {
+    '%cols%' <- get('%dorng%')
+    idxList <- as.list(isplitVector(nzsort.j, chunkSize = getDoParWorkers()))
+  } 
+  # print("idxList")
+  # print(idxList)
+  
+  ## initialize parameters of interest
+  iter <- 0
+  k <- length(unique(varType))
+  convNew <- rep(0, k)
+  names(convNew) <- c('numeric')
+  convOld <- rep(Inf, k)
+  convergence <- c()
+  #OOBerror <- numeric(p)
+  #names(OOBerror) <- varType
+  
+  #maxiter <- 10
+  Ximp <- vector('list', maxiter)
+  
+  # Stop criteria
+  stopCriterion <- function(varType, convNew, convOld, iter, maxiter){
+    k <- length(unique(varType))
+    if (k == 1){
+      (convNew < convOld) & (iter < maxiter)
+    } else {
+      ((convNew[1] < convOld[1]) | (convNew[2] < convOld[2])) & (iter < maxiter)
+    }
+  }
+  
+  # Main Imputation Function
+  while (stopCriterion(varType, convNew, convOld, iter, maxiter)) {
+    if (iter != 0) {
+      convOld <- convNew
+      # OOBerrOld <- OOBerr
+    }
+    if (verbose) {
+      cat("  XGBoost iteration", iter+1, "in progress...")
+    }
+    t.start <- proc.time()
+    ximp.old <- ximp
+    
+    for (idx in idxList) {
+      results <- foreach(varInd = idx, .packages = 'xgboost') %cols% {
+        obsi <- !NAloc[, varInd] # which i's are observed
+        misi <- NAloc[, varInd] # which i's are missing
+        print(paste("Dimensions of obsi:", dim(obsi)))
+        print(paste("Dimensions of misi:", dim(misi)))
+        obsY <- ximp[obsi, varInd, drop = FALSE] # training response
+        misY <- ximp[misi, varInd, drop = FALSE] # testing response
+        obsX <- ximp[obsi, seq(1, p)[-varInd], drop = FALSE] # training variables
+        misX <- ximp[misi, seq(1, p)[-varInd], drop = FALSE] # prediction variables
+        print(paste("Dimensions of obsX:", dim(obsX)))
+        print(paste("Dimensions of misX:", dim(misX)))
+        print(paste("Dimensions of misY:", dim(misY)))
+        print(paste("Dimensions of obsY:", dim(obsY)))
+        # print("obsX:")
+        # dim(obsX)
+        # # print(head(obsX))
+        # print("obsY:")
+        # dim(obsY)
+        # print(head(obsY))
+        # print("misX:")
+        # print(head(misX))
+        # print("misY:")
+        # print(head(misY))
+        # xgb_obsX = xgb.DMatrix(data = as.matrix(obsX), label = as.matrix(obsY)) # xgboost version of train
+        xgb_obsX <- tryCatch({
+          xgb.DMatrix(data = as.matrix(obsX), label = as.matrix(obsY))
+        }, error = function(e) {
+          print(paste("Error in creating xgb_obsX:", e$message))
+          print(paste("Dimensions of obsX:", dim(obsX)))
+          print(paste("Dimensions of obsY:", dim(obsY)))
+          NULL
+        })
+        xgb_misX = xgb.DMatrix(data = as.matrix(misX), label = as.matrix(misY))# misX contains the rows where the current variable (varInd) is missing. # misY also contains these missing values, which you're trying to predict. XGBoost doesn't expect labels for the data you're trying to predict.
+        if (inherits(xgb_misX, "try-error")) {
+          print("Error in creating xgb_misX")
+          print(xgb_misX)
+        }
+        #typeY <- varType[varInd]
+        print("xgb_obs:")
+        # print(head(xgb_obsX))
+        # print("xgb_mis:")
+        # print(head(xgb_misX))
+        # xgboost on train
+        xgb <- xgboost(data = xgb_obsX, max.depth = max.depth, nrounds = nrounds)
+        
+        misY <- predict(xgb, xgb_misX) ## predict missing values in column varInd
+        
+        list(varInd = varInd, misY = misY) #oerr = oerr
+      }
+      
+      for (res in results) {
+        misi <- NAloc[,res$varInd]
+        ximp[misi, res$varInd] <- res$misY
+        #OOBerror[res$varInd] <- res$oerr
+      }
+    }
+    
+    if (verbose){
+      cat('done!\n')
+    }
+    
+    iter <- iter + 1
+    Ximp[[iter]] <- ximp
+    
+    t.co2 <- 1
+    ## check the difference between iteration steps
+    for (t.type in names(convNew)){
+      t.ind <- which(varType == t.type)
+      convNew[t.co2] <- sum((ximp[, t.ind] - ximp.old[, t.ind])^2) / sum(ximp[, t.ind]^2)
+      t.co2 <- t.co2 + 1
+    }
+  }  
+  
+  # Extrtact the output
+  if (iter == maxiter) {
+    out <- Ximp[[iter]]
+  } else {
+    out <- Ximp[[iter - 1]]
+  }
+  
+  return(out)
+}
+

From 8f7d25001ccfffb79724cc27fec9fe66875fdaab Mon Sep 17 00:00:00 2001
From: rl3328 <rl3328@cumc.columbia.edu>
Date: Mon, 30 Sep 2024 23:11:31 -0400
Subject: [PATCH 3/4] Update containers.csv

---
 container/containers.csv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/container/containers.csv b/container/containers.csv
index 3ec20add3..99abd014c 100644
--- a/container/containers.csv
+++ b/container/containers.csv
@@ -73,7 +73,7 @@ trimmomatic,https://github.com/timflutre/trimmomatic,Java,0.39,N/A,bioconda,rna_
 gtex-pipeline-qtl,https://github.com/broadinstitute/gtex-pipeline/tree/master/qtl,Python,gtex_v8,bed31e12a48b25db8b111252e2136b31c64c9e82,dnachun,rna_quantification,Other,https://anaconda.org/dnachun/gtex-pipeline-qtl,
 pandas,https://pypi.org/project/pandas/,Python,1.3.5,N/A,conda-forge,"rna_quantification, tensorqtl, quantqtl",BSD-3-Clause,https://anaconda.org/conda-forge/pandas,
 r-mvsusier,https://github.com/stephenslab/mvsusieR,R,0.1.7,N/A,dnachun,pecotmr,GPL-3.0-or-later,https://anaconda.org/dnachun/r-mvsusier,
-r-pecotmr,https://github.com/cumc/pecotmr,R,0.1.70,N/A,dnachun,pecotmr,GPL-3.0-or-later,https://anaconda.org/dnachun/r-pecotmr,
+r-pecotmr,https://github.com/cumc/pecotmr,R,0.1.71,N/A,dnachun,pecotmr,GPL-3.0-or-later,https://anaconda.org/dnachun/r-pecotmr,
 r-fsusier,https://github.com/stephenslab/fsusier,R,0.2.0,dd4df3668d2ca59dd34a52b6c1b426054a998fbd,dnachun,pecotmr,GPL-3.0-or-later,https://anaconda.org/dnachun/r-fsusier,
 backports.zoneinfo,https://pypi.org/project/backports.zoneinfo/,Python,0.2.1,N/A,conda-forge,"pecotmr, quantqtl",Apache-2.0,https://anaconda.org/conda-forge/backports.zoneinfo,
 bioconductor-variantannotation,https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html,R,1.48.1,N/A,bioconda,"pecotmr, quantqtl",Artistic-2.0,https://anaconda.org/bioconda/bioconductor-variantannotation,
@@ -134,4 +134,4 @@ bioconductor-edger,https://bioconductor.org/packages/3.18/bioc/html/edgeR.html,R
 bioconductor-limma,https://bioconductor.org/packages/3.18/bioc/html/limma.html,R,3.58.1,N/A,bioconda,seurat,GPL-2.0-or-later,https://anaconda.org/bioconda/bioconductor-limma,
 r-profvis,https://rstudio.github.io/profvis/,R,,N/A,conda-forge,quantqtl,GPL-3.0-or-later,https://anaconda.org/conda-forge/r-profvis,
 r-bigsnpr,https://privefl.github.io/bigsnpr/,R,1.12.2,N/A,conda-forge,pecotmr,GPL-3.0-or-later,https://anaconda.org/conda-forge/r-bigsnpr,
-quantas,https://github.com/chaolinzhanglab/quantas,Perl,1.1.2,N/A,dnachun,quantas,Other,https://anaconda.org/dnachun/quantas,
\ No newline at end of file
+quantas,https://github.com/chaolinzhanglab/quantas,Perl,1.1.2,N/A,dnachun,quantas,Other,https://anaconda.org/dnachun/quantas,

From 657b110aa74bbabf9291eeb9236e58ac5e79096b Mon Sep 17 00:00:00 2001
From: rl3328 <rl3328@users.noreply.github.com>
Date: Tue, 1 Oct 2024 03:12:01 +0000
Subject: [PATCH 4/4] Update YAML files and Markdown table

---
 container/README.md            | 2 +-
 container/pecotmr/pecotmr.yml  | 2 +-
 container/readme/containers.md | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/container/README.md b/container/README.md
index 452b022f1..f6549fa90 100644
--- a/container/README.md
+++ b/container/README.md
@@ -99,7 +99,7 @@ On occasion, you may see errors that a download or upload has failed while build
 | r-bigsnpr                                         | [Link](https://privefl.github.io/bigsnpr/)                                                                       | [Link](https://anaconda.org/conda-forge/r-bigsnpr)                                      | pecotmr                                                                                                 | 1.12.2     | conda&#x2011;forge | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
 | r-fsusier                                         | [Link](https://github.com/stephenslab/fsusier)                                                                   | [Link](https://anaconda.org/dnachun/r-fsusier)                                          | pecotmr                                                                                                 | 0.2.0      | dnachun            | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
 | r-mvsusier                                        | [Link](https://github.com/stephenslab/mvsusieR)                                                                  | [Link](https://anaconda.org/dnachun/r-mvsusier)                                         | pecotmr                                                                                                 | 0.1.7      | dnachun            | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
-| r-pecotmr                                         | [Link](https://github.com/cumc/pecotmr)                                                                          | [Link](https://anaconda.org/dnachun/r-pecotmr)                                          | pecotmr                                                                                                 | 0.1.70     | dnachun            | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
+| r-pecotmr                                         | [Link](https://github.com/cumc/pecotmr)                                                                          | [Link](https://anaconda.org/dnachun/r-pecotmr)                                          | pecotmr                                                                                                 | 0.1.71     | dnachun            | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
 | r-susier                                          | [Link](https://github.com/stephenslab/susieR)                                                                    | [Link](https://anaconda.org/conda-forge/r-susier)                                       | pecotmr, fastenloc                                                                                      | 0.14.7     | conda&#x2011;forge | MIT                                    |
 | backports.zoneinfo                                | [Link](https://pypi.org/project/backports.zoneinfo/)                                                             | [Link](https://anaconda.org/conda-forge/backports.zoneinfo)                             | pecotmr, quantqtl                                                                                       | 0.2.1      | conda&#x2011;forge | Apache&#x2011;2.0                      |
 | bioconductor-variantannotation                    | [Link](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html)                               | [Link](https://anaconda.org/bioconda/bioconductor-variantannotation)                    | pecotmr, quantqtl                                                                                       | 1.48.1     | bioconda           | Artistic&#x2011;2.0                    |
diff --git a/container/pecotmr/pecotmr.yml b/container/pecotmr/pecotmr.yml
index e44262dd1..cca0ce6bf 100644
--- a/container/pecotmr/pecotmr.yml
+++ b/container/pecotmr/pecotmr.yml
@@ -16,7 +16,7 @@ dependencies:
   - r-genio=1.1.2
   - r-gwasrapidd=0.99.14
   - r-mvsusier=0.1.7
-  - r-pecotmr=0.1.70
+  - r-pecotmr=0.1.71
   - r-susier=0.14.7
   - r-udr=0.3.154
   - altair
diff --git a/container/readme/containers.md b/container/readme/containers.md
index 2f984bffb..625af59c2 100644
--- a/container/readme/containers.md
+++ b/container/readme/containers.md
@@ -48,7 +48,7 @@
 | r-bigsnpr                                         | [Link](https://privefl.github.io/bigsnpr/)                                                                       | [Link](https://anaconda.org/conda-forge/r-bigsnpr)                                      | pecotmr                                                                                                 | 1.12.2     | conda&#x2011;forge | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
 | r-fsusier                                         | [Link](https://github.com/stephenslab/fsusier)                                                                   | [Link](https://anaconda.org/dnachun/r-fsusier)                                          | pecotmr                                                                                                 | 0.2.0      | dnachun            | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
 | r-mvsusier                                        | [Link](https://github.com/stephenslab/mvsusieR)                                                                  | [Link](https://anaconda.org/dnachun/r-mvsusier)                                         | pecotmr                                                                                                 | 0.1.7      | dnachun            | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
-| r-pecotmr                                         | [Link](https://github.com/cumc/pecotmr)                                                                          | [Link](https://anaconda.org/dnachun/r-pecotmr)                                          | pecotmr                                                                                                 | 0.1.70     | dnachun            | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
+| r-pecotmr                                         | [Link](https://github.com/cumc/pecotmr)                                                                          | [Link](https://anaconda.org/dnachun/r-pecotmr)                                          | pecotmr                                                                                                 | 0.1.71     | dnachun            | GPL&#x2011;3.0&#x2011;or&#x2011;later  |
 | r-susier                                          | [Link](https://github.com/stephenslab/susieR)                                                                    | [Link](https://anaconda.org/conda-forge/r-susier)                                       | pecotmr, fastenloc                                                                                      | 0.14.7     | conda&#x2011;forge | MIT                                    |
 | backports.zoneinfo                                | [Link](https://pypi.org/project/backports.zoneinfo/)                                                             | [Link](https://anaconda.org/conda-forge/backports.zoneinfo)                             | pecotmr, quantqtl                                                                                       | 0.2.1      | conda&#x2011;forge | Apache&#x2011;2.0                      |
 | bioconductor-variantannotation                    | [Link](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html)                               | [Link](https://anaconda.org/bioconda/bioconductor-variantannotation)                    | pecotmr, quantqtl                                                                                       | 1.48.1     | bioconda           | Artistic&#x2011;2.0                    |