Commit

Update documentation
gaow committed Mar 15, 2024
1 parent 576f8c0 commit c8bfeda
Showing 13 changed files with 661 additions and 372 deletions.
12 changes: 6 additions & 6 deletions _sources/code/data_preprocessing/genotype/VCF_QC.ipynb
@@ -157,7 +157,7 @@
"source": [
"sos run VCF_QC.ipynb rename_chrs \\\n",
" --genoFile reference_data/00-All.vcf.gz \\\n",
" --cwd reference_data --container bioinfo.sif"
" --cwd reference_data --container oras://ghcr.io/cumc/bioinfo_apptainer:latest"
]
},
{
@@ -174,7 +174,7 @@
"source": [
"sos run VCF_QC.ipynb dbsnp_annotate \\\n",
" --genoFile reference_data/00-All.add_chr.vcf.gz \\\n",
" --cwd reference_data --container bioinfo.sif"
" --cwd reference_data --container oras://ghcr.io/cumc/bioinfo_apptainer:latest"
]
},
{
@@ -193,7 +193,7 @@
" --genoFile data/MWE/MWE_genotype.vcf \\\n",
" --dbsnp-variants data/reference_data/00-All.add_chr.variants.gz \\\n",
" --reference-genome data/reference_data/GRCh38_full_analysis_set_plus_decoy_hla.noALT_noHLA_noDecoy_ERCC.fasta \\\n",
" --cwd MWE/output/genotype_1 --container bioinfo.sif -J 1 -c csg.yml -q csg"
" --cwd MWE/output/genotype_1 --container oras://ghcr.io/cumc/bioinfo_apptainer:latest"
]
},
{
@@ -222,7 +222,7 @@
" --genoFile data/mwe/mwe_genotype_list \\\n",
" --dbsnp-variants data/reference_data/00-All.add_chr.variants.gz \\\n",
" --reference-genome data/reference_data/GRCh38_full_analysis_set_plus_decoy_hla.noALT_noHLA_noDecoy_ERCC.fasta \\\n",
" --cwd MWE/output/genotype_4 --container bioinfo.sif --add-chr"
" --cwd MWE/output/genotype_4 --container oras://ghcr.io/cumc/bioinfo_apptainer:latest"
]
},
{
@@ -542,9 +542,9 @@
},
"outputs": [],
"source": [
"[rename_chrs: provides = '{filename}.add_chr.vcf.gz']\n",
"[rename_chrs: provides = '{genoFile:nn}.add_chr.vcf.gz']\n",
"# This file can be downloaded from https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh38p7/VCF/00-All.vcf.gz.\n",
"input: f'{filename}.vcf.gz'\n",
"input: genoFile\n",
"output: f'{_input:nn}.add_chr.vcf.gz'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
"bash: container = container, expand= \"${ }\", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout', entrypoint=entrypoint\n",
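Note on the VCF_QC.ipynb changes above: the examples now point `--container` at the published image `oras://ghcr.io/cumc/bioinfo_apptainer:latest` instead of a local `bioinfo.sif`, and the `rename_chrs` step keys its `provides` and `input` on the `genoFile` parameter rather than a `{filename}` wildcard. The bash body of `rename_chrs` is truncated in this diff (it most likely wraps a bcftools command); the sketch below is only a hypothetical illustration of its effect, namely prefixing contig names in the dbSNP VCF with `chr`.

```python
# Hypothetical sketch, not the pipeline's actual rename_chrs body (which is not shown in this
# diff): add a "chr" prefix to contig names in a VCF. Plain gzip is used here for simplicity;
# a real pipeline would bgzip and tabix-index the output.
import gzip

def add_chr_prefix(in_vcf: str, out_vcf: str) -> None:
    with gzip.open(in_vcf, "rt") as fin, gzip.open(out_vcf, "wt") as fout:
        for line in fin:
            if line.startswith("#"):
                # rename contigs declared in the header as well
                fout.write(line.replace("##contig=<ID=", "##contig=<ID=chr"))
            else:
                fout.write("chr" + line)

# Example (paths follow the command shown above):
# add_chr_prefix("reference_data/00-All.vcf.gz", "reference_data/00-All.add_chr.vcf.gz")
```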
@@ -1191,6 +1191,7 @@
"parameter: sample_participant_lookup = path()\n",
"input: phenoFile, annotation_gtf\n",
"output: f'{cwd:a}/{_input[0]:bn}.formated.bed.gz', f'{cwd:a}/{_input[0]:bn}.phenotype_group.txt'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output[0]:bn}' \n",
"python: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container, entrypoint=entrypoint\n",
" import pandas as pd\n",
" import numpy as np\n",
@@ -780,6 +780,8 @@
" names(bed_file)[1] <- \"#Chr\"\n",
" bed_file$'#Chr' <- sub(\"^\", \"chr\", bed_file$'#Chr')\n",
" row.names(bed_file) <- NULL \n",
"\n",
" bed_file <- bed_file[order(bed_file$'#Chr',bed_file$start,bed_file$end),]\n",
" \n",
" # Create BED file output\n",
" write.table(x=bed_file, file = \"${cwd}/psichomics_raw_data_bedded.txt\", quote = FALSE, row.names = FALSE, sep = \"\\t\")"
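The lines added above sort the psichomics BED records by chromosome, start, and end before `write.table`; coordinate-sorted output is what downstream compression and indexing (e.g., bgzip/tabix) typically expect. A minimal pandas equivalent, assuming the same column names and an illustrative file path:

```python
# Sketch of the same sort-before-write step in pandas. Column names ("#Chr", "start", "end")
# mirror the R code above; the file path is illustrative rather than the pipeline's actual output.
import pandas as pd

bed = pd.read_csv("psichomics_raw_data_bedded.txt", sep="\t")
bed = bed.sort_values(["#Chr", "start", "end"])
bed.to_csv("psichomics_raw_data_bedded.txt", sep="\t", index=False)
```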
411 changes: 244 additions & 167 deletions _sources/code/molecular_phenotypes/calling/RNA_calling.ipynb

Large diffs are not rendered by default.

@@ -342,8 +342,8 @@
" res <- list()\n",
" for (i in 1:nrow(meta_df)) {\n",
" line <- meta_df[i,]\n",
" region_file <- line[6]\n",
" sumstats_db_file <- line[7]\n",
" region_file <- line[7]\n",
" sumstats_db_file <- line[8]\n",
" strong_file <- load_multitrait_R_sumstat(readRDS(region_file), readRDS(sumstats_db_file), coverage = \"${coverage}\", top_loci=TRUE, exclude_condition = exclude_condition)\n",
" ran_null_file <- load_multitrait_R_sumstat(readRDS(region_file), readRDS(sumstats_db_file), filter_file=${independent_variant_list:r}, exclude_condition = exclude_condition)\n",
" ran_null <- mash_rand_null_sample(ran_null_file, ${n_random}, ${n_null}, exclude_condition = exclude_condition, seed=${seed})\n",
96 changes: 80 additions & 16 deletions _sources/code/post_processing/fine_mapping_post_processing.ipynb
@@ -251,6 +251,7 @@
},
"outputs": [],
"source": [
"#susie\n",
"sos run xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n",
" --region_file ~/codes/fungen-xqtl-analysis/resource/TADB_enhanced_cis.protein_coding.bed \\\n",
" --file_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024_new/rds_files/ \\\n",
@@ -262,6 +263,28 @@
" --min_corr 0.8 \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f99ee4f4-b614-467c-8336-4471c749a975",
"metadata": {
"kernel": "SoS"
},
"outputs": [],
"source": [
"#fsusie\n",
"sos run xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n",
" --region_file /mnt/vast/hpc/homes/rf2872/codes/fungen-xqtl-analysis/resource/TADB_region.bed \\\n",
" --file_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024_new/rds_files/ \\\n",
" --name Fungen_xQTL \\\n",
" --cwd output \\\n",
" --suffix fsusie_mixture_normal_top_pc_weights.rds \\\n",
" --prefix ROSMAP_haQTL \\\n",
" --data_type qtl \\\n",
" --min_corr 0.8 \\\n",
" --fsusie True"
]
},
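The cell above is new in this commit: it reruns `cis_results_export` with `--fsusie True` so that fSuSiE results, which keep their preset results in a different layer of the RDS object (see the workflow changes further down), are exported correctly. A hypothetical sketch of how the per-region input files for such a run could be enumerated, assuming files are matched by the `--prefix`/`--suffix` naming shown above (directory and pattern are illustrative):

```python
# Hypothetical sketch: list the per-region RDS files an export run like the one above would
# consume, assuming a prefix*suffix naming convention. Paths and pattern are illustrative.
from pathlib import Path

file_path = Path("rds_files")
prefix = "ROSMAP_haQTL"
suffix = "fsusie_mixture_normal_top_pc_weights.rds"
rds_files = sorted(file_path.glob(f"{prefix}*{suffix}"))
print(f"{len(rds_files)} regions to export")
```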
{
"cell_type": "markdown",
"id": "65f3b184-552a-4ca0-b11d-0e56516ed6ff",
@@ -313,10 +336,10 @@
"outputs": [],
"source": [
"sos run ~/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb overlap_qtl_gwas \\\n",
" --qtl-files `ls /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024/data/*/*export.rds ` \\\n",
" --gwas-union-file /mnt/vast/hpc/csg/rf2872/Work/Multivariate/gwas/output/ADGWAS.union_export.tsv.gz \\\n",
" --name ROSMAP_eQTL \\\n",
" -c ~/env_files/csg.yml -J 50"
" --qtl_meta_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/gwas/output/Fungen_xQTL.cis_results_db.tsv \\\n",
" --gwas_meta_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/gwas/ADGWAS_Mar7/ADGWAS_Feb22.block_results_db.tsv \\\n",
" --cwd overlap_test"
]
},
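The overlap step now takes the exported meta tables directly via `--qtl_meta_path` and `--gwas_meta_path` rather than explicit per-region file lists. A quick way to inspect such a meta table before running the overlap, using column names that appear in the workflow code further down (the path is illustrative):

```python
# Peek at an exported *_results_db.tsv meta table. Column names (region_id, combined_data,
# conditions_top_loci) come from the workflow code below; the path is illustrative.
import pandas as pd

qtl_meta = pd.read_csv("Fungen_xQTL.cis_results_db.tsv", sep="\t", low_memory=False)
with_loci = qtl_meta[qtl_meta["conditions_top_loci"].notna()]
print(with_loci[["region_id", "combined_data", "conditions_top_loci"]].head())
```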
{
@@ -1390,6 +1413,7 @@
"# the analysis would be focused on the union of provides region list and region names\n",
"parameter: region_name = []\n",
"parameter: min_corr = 0.5\n",
"parameter: fsusie = False\n",
"import pandas as pd\n",
"import os\n",
"\n",
@@ -1535,7 +1559,30 @@
" names(pip) = susie_obj$variant_names\n",
" return(pip)\n",
" }\n",
" \n",
" # add column in top loci to indicate if the variant is identified in susie_on_top_pc top loci table or not\n",
" annotate_susie <- function(obj, top_loci){\n",
" df <- data.frame()\n",
"\n",
" results <- lapply(obj[['susie_on_top_pc']], function(x) {\n",
" x[['top_loci']]\n",
" })\n",
"\n",
" df <- bind_rows(results)\n",
"\n",
" top_loci_others <- df %>%\n",
" filter(rowSums(select(., starts_with('cs_coverage_'))) == 0)\n",
" top_loci_cs <- df %>%\n",
" filter(rowSums(select(., starts_with('cs_coverage_'))) > 0) \n",
"\n",
" susie_vars <- rbind(top_loci_others %>% filter(pip >= 0.05), top_loci_cs) %>% pull(variant_id)\n",
" susie_vars_cs <- top_loci_cs %>% pull(variant_id)\n",
" top_loci <- top_loci %>% mutate(annotated_susie = ifelse(variant_id %in% susie_vars, 1, 0),\n",
" annotated_susie_cs = ifelse(variant_id %in% susie_vars_cs, 1, 0))\n",
"\n",
" return(top_loci)\n",
" }\n",
" \n",
" # Process each path and collect results\n",
" orig_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input])})\n",
" # Extract info from each RDS file\n",
@@ -1568,8 +1615,9 @@
" if(length(conditions) > 0) {\n",
" \n",
" for(condition in conditions) {\n",
" dat_con <- dat_study[[condition]]\n",
" dat_susie <- dat_con$susie_result_trimmed\n",
" dat_con <- dat_study[[condition]]${[['fsusie_summary']] if fsusie else ''}\n",
" dat_susie <- dat_con$susie_result_trimme\n",
"\n",
" \n",
" if(${\"TRUE\" if condition_meta.is_file() else \"FALSE\"}){\n",
" meta <- suppressMessages(read_delim(\"${condition_meta}\", col_names = F))\n",
@@ -1583,11 +1631,20 @@
" res[[gene]][[s]][[qtl_con]] <- list(\n",
" region_info = dat_con$region_info,\n",
" top_loci = process_top_loci(dat_susie, dat_con$top_loci),\n",
" preset_top_loci = process_top_loci(dat_susie, dat_con$preset_variants_result$top_loci),\n",
" pip = get_pip_withname(dat_con),\n",
" preset_pip = get_pip_withname(dat_con$preset_variants_result),\n",
" CV_table = dat_con$twas_cv_result$performance\n",
" )\n",
" if ('${data_type}' == 'qtl') { \n",
" # fsusie saved preset results in different layer\n",
" if (${\"TRUE\" if fsusie else \"FALSE\"}){\n",
" res[[gene]][[s]][[qtl_con]][['top_loci']] = annotate_susie(dat_study[[condition]], res[[gene]][[s]][[qtl_con]][['top_loci']])\n",
" res[[gene]][[s]][[qtl_con]][['preset_top_loci']] = process_top_loci(dat_susie, dat_study[[condition]]$preset_variants_result$top_loci) %>% annotate_susie(dat_study[[condition]], .)\n",
" res[[gene]][[s]][[qtl_con]][['preset_pip']] = get_pip_withname(dat_study[[condition]]$preset_variants_result)\n",
" } else {\n",
" res[[gene]][[s]][[qtl_con]][['preset_top_loci']] = process_top_loci(dat_susie, dat_con$preset_variants_result$top_loci)\n",
" res[[gene]][[s]][[qtl_con]][['preset_pip']] = get_pip_withname(dat_con$preset_variants_result)\n",
" }\n",
" }\n",
" \n",
" res_sum[[gene]][[s]][[qtl_con]] <- list(\n",
" variant_names = dat_con$variant_names,\n",
@@ -1618,10 +1675,12 @@
" saveRDS(res, combine_data)\n",
" saveRDS(res_sum, combine_data_sumstats)\n",
" TSS <- tryCatch({dat_con$region_info$region_coord$start}, error = function(e) {return(NA)})\n",
" \n",
" if (${\"TRUE\" if fsusie else \"FALSE\"}) conditions = paste(names(res[[1]]), collapse = \",\") else conditions = paste(names(res[[gene]]), collapse = \",\")\n",
" \n",
" meta = data.frame(chr=\"${_meta_info[0]}\", start=\"${_meta_info[1]}\", end=\"${_meta_info[2]}\", gene_id=\"${_meta_info[3]}\", TSS = if(is.null(TSS)) NA else TSS, \n",
" original_data = paste(basename(orig_files), collapse = \", \"), combined_data = basename(combine_data), combined_data_sumstats = basename(combine_data_sumstats), \n",
" conditions = paste(names(res[[gene]]), collapse = \",\"), \n",
" meta = data.frame(chr=\"${_meta_info[0]}\", start=\"${_meta_info[1]}\", end=\"${_meta_info[2]}\", region_id=\"${_meta_info[3]}\", TSS = if(is.null(TSS)) NA else TSS, \n",
" original_data = paste(basename(orig_files), collapse = \",\"), combined_data = basename(combine_data), combined_data_sumstats = basename(combine_data_sumstats), \n",
" conditions = conditions, \n",
" conditions_top_loci = if(length(cons_top_loci) > 0) cons_top_loci[[1]] %>% unlist %>% names %>% as.character %>% paste(., collapse = ',') else '')\n",
" write_delim(meta, \"${_output}\", delim = '\\t')\n",
" "
@@ -1668,7 +1727,7 @@
" if (${\"TRUE\" if exported_file.is_file() else \"FALSE\"}){\n",
" exp_meta <- read_delim(${_output:r}, delim = '\\t')\n",
" meta <- bind_rows(meta, exp_meta) %>%\n",
" group_by(gene_id) %>%\n",
" group_by(region_id) %>%\n",
" summarise(across(c(original_data, combined_data, combined_data_sumstats, conditions, conditions_top_loci), \n",
" ~paste(unique(.), collapse = \",\")),\n",
" .groups = 'drop')\n",
@@ -1761,9 +1820,9 @@
"[overlap_qtl_gwas_1]\n",
"parameter: per_chunk = 100\n",
"parameter: gwas_meta_path = path()\n",
"parameter: qtl_file_path = ''\n",
"parameter: gwas_file_path = ''\n",
"parameter: qtl_meta_path = path()\n",
"parameter: gwas_file_path = ''\n",
"parameter: qtl_file_path = ''\n",
"# Optional: if a region list is provide the analysis will be focused on provided region. \n",
"# The LAST column of this list will contain the ID of regions to focus on\n",
"# Otherwise, all regions with both genotype and phenotype files will be analyzed\n",
@@ -1775,6 +1834,11 @@
"import numpy as np\n",
"from pathlib import Path\n",
"\n",
"if qtl_file_path == '':\n",
" qtl_file_path = qtl_meta_path.parent\n",
"if gwas_file_path == '':\n",
" gwas_file_path = gwas_meta_path.parent\n",
" \n",
"# Load the data, suppressing messages is not typically done in pandas as it does not inherently output messages when loading files\n",
"gwas_meta = pd.read_csv(gwas_meta_path, sep='\\t', low_memory=False)\n",
"gwas_meta = gwas_meta[gwas_meta['conditions_top_loci'].notna()]\n",
@@ -1798,7 +1862,7 @@
" region_ids = list(set(region_ids).union(set(region_name)))\n",
" \n",
"if len(region_ids) > 0:\n",
" qtl_meta = qtl_meta[qtl_meta['gene_id'].isin(region_ids)]\n",
" qtl_meta = qtl_meta[qtl_meta['region_id'].isin(region_ids)]\n",
" \n",
"def group_by_region(lst, partition):\n",
" # from itertools import accumulate\n",
@@ -1835,7 +1899,7 @@
"\n",
"\n",
"regional_data = {\n",
" 'meta': [(row['#chr'], row['start'], row['end'], row['gene_id'], row['gwas_file'].split(',')) for _, row in qtl_meta_filtered.iterrows()],\n",
" 'meta': [(row['#chr'], row['start'], row['end'], row['region_id'], row['gwas_file'].split(',')) for _, row in qtl_meta_filtered.iterrows()],\n",
" 'qtl_data': [f\"{qtl_file_path}/{row['combined_data']}\" for _, row in qtl_meta_filtered.iterrows()]\n",
"}\n",
"\n",
@@ -1981,7 +2045,7 @@
" }\n",
" \n",
" qtl_meta <- suppressMessages(read_delim('${qtl_meta_path}'))\n",
" qtl_meta <- qtl_meta %>% filter(gene_id == '${_meta_info[3]}') %>% mutate(block_top_loci = block_top_loci,\n",
" qtl_meta <- qtl_meta %>% filter(region_id == '${_meta_info[3]}') %>% mutate(block_top_loci = block_top_loci,\n",
" final_combined_data = final_combined_data)\n",
" fwrite(qtl_meta, ${_output:r})"
]