Update documentation

cumc · Mar 18, 2024 · 36de0ae · 36de0ae
1 parent c8bfeda
commit 36de0ae
Show file tree

Hide file tree

Showing 9 changed files with 134 additions and 104 deletions.
diff --git a/_sources/code/molecular_phenotypes/calling/RNA_calling.ipynb b/_sources/code/molecular_phenotypes/calling/RNA_calling.ipynb
@@ -1372,7 +1372,7 @@
     "bash:  container=container,expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', entrypoint=entrypoint\n",
     "        fastp -i ${f'{_input[0]} -I {_input[1]}' if is_paired_end else _input} -o ${ f'{_output[0]} -O {_output[1]}' if is_paired_end else _output } \\\n",
     "            ${f'--adapter_fasta {fasta_with_adapters_etc}' if fasta_with_adapters_etc.is_file() else \"--detect_adapter_for_pe\"}  -V -h ${_output[0]:n}.html -j ${_output[0]:n}.json -w ${numThreads} \\\n",
-    "            --length_required ${min_len}  -W ${window_size} -M ${required_quality} -5 -3 --cut_front_mean_quality ${leading} --cut_tail_mean_quality ${leading}\n"
+    "            --length_required ${min_len}  -W ${window_size} -M ${required_quality} -5 -3 --cut_front_mean_quality ${leading} --cut_tail_mean_quality ${leading}"
    ]
   },
   {
@@ -1769,7 +1769,7 @@
     "parameter: reference_fasta = path\n",
     "# For the patterned flowcell models (HiSeq X), change to 2500\n",
     "parameter: optical_distance = 100\n",
-    "parameter: zap_raw_bam = True\n",
+    "parameter: zap_raw_bam = False\n",
     "picard_strand_dict = {\"rf\":\"SECOND_READ_TRANSCRIPTION_STRAND\",\"fr\": \"FIRST_READ_TRANSCRIPTION_STRAND\",\"unstranded\":\"NONE\" }\n",
     "input: output_from(\"STAR_align_1\"), group_by = 2, group_with = {\"sample_id\",\"strand\"}\n",
     "output: picard_metrics = f'{_sample_id}.alignment_summary_metrics',\n",
@@ -1821,7 +1821,7 @@
     "        -VALIDATION_STRINGENCY STRICT \\\n",
     "        -REMOVE_SEQUENCING_DUPLICATES false \\\n",
     "        -REMOVE_DUPLICATES false\n",
-    "    \n",
+    "    samtools index ${_output[2]}\n",
     "    # Record end timestamp and calculate runtime for MarkDuplicates\n",
     "    end_time=$(date +%s)\n",
     "    runtime=$((end_time - start_time))\n",
@@ -1845,6 +1845,7 @@
     "        strand = []\n",
     "        tag = []\n",
     "        annotation_gtf = \"${gtf}.tmp.${_input[\"cord_bam\"]:bnnnn}\"\n",
+    "\n",
     "        with open(annotation_gtf, 'r') as gtf:\n",
     "            for row in gtf:\n",
     "                row = row.strip().split('\\t')\n",
@@ -1906,16 +1907,16 @@
     "    }\n",
     "\n",
     "    export -f generate_bigwig  # To make it available in subshells, if needed.\n",
-    "    generate_bigwig ${_output[\"md_bam\"]}\n",
+    "    generate_bigwig $[_output[\"md_bam\"]]\n",
     "\n",
     "import pandas as pd\n",
     "out = pd.DataFrame({\n",
     "    \"sample_id\": _sample_id,\n",
     "    \"strand\": _strand,\n",
-    "    \"coord_bam_list\": _output[\"md_bam\"],\n",
-    "    \"BW_list\": _output[\"bigwig\"],\n",
+    "    \"coord_bam_list\": f'{_output[\"md_bam\"]:b}',\n",
+    "    \"BW_list\": f'{_output[\"bigwig\"]:b}',\n",
     "    \"SJ_list\": f'{_output[\"md_bam\"]:bnnnnn}.SJ.out.tab',\n",
-    "    \"trans_bam_list\": f'{_output[\"md_bam\"]:bnnnn}.toTranscriptome.out.bam'\n",
+    "    \"trans_bam_list\": f'{_output[\"md_bam\"]:bnnnn}.toTranscriptome.out_wasp_qc.bam'\n",
     "\n",
     "})\n",
     "out.to_csv(_output[\"output_summary\"], sep=\"\\t\", index=False)\n",

diff --git a/_sources/code/multivariate_genome/MASH/mash_preprocessing.ipynb b/_sources/code/multivariate_genome/MASH/mash_preprocessing.ipynb
@@ -405,9 +405,9 @@
     "      renamed_res_reformatted <- rename_rownames(res_reformatted)\n",
     "      merged_data <- list()\n",
     "      for(region in names(renamed_res_reformatted)){\n",
-    "        merged_data <- merge_data(merged_data, renamed_res_reformatted[[region]])\n",
+    "        merged_data <- merge_mash_data(merged_data, renamed_res_reformatted[[region]])\n",
     "      }\n",
-    "      batch_combined_data <- merge_data(batch_combined_data,merged_data)\n",
+    "      batch_combined_data <- merge_mash_data(batch_combined_data,merged_data)\n",
     "    } \n",
     "   conditions = c(\"strong\", \"random\", \"null\")\n",
     "   for (cond in conditions){\n",

diff --git a/_sources/code/multivariate_genome/MASH/mixture_prior.ipynb b/_sources/code/multivariate_genome/MASH/mixture_prior.ipynb
@@ -748,14 +748,13 @@
     "    fit0 <- ud_init(mash_data, n_unconstrained = 50, U_scaled = U.can)\n",
     "    # Fit udr and use penalty as default as suggested by Yunqi\n",
     "    # penalty is necessary in small sample size case, and there won't be a difference in large sample size \n",
-    "    fit2 = ud_fit(fit0, control = list(unconstrained.update = \"ted\", scaled.update = \"fa\", resid.update = 'none', \n",
+    "    fit2 = ud_fit(fit0, control = list(unconstrained.update = \"${unconstrained_prior}\", scaled.update = \"fa\", resid.update = 'none', \n",
     "                                       lambda =lambda, penalty.type = \"iw\", maxiter=1e3, tol = 1e-2, tol.lik = 1e-2))\n",
     "\n",
     "    # extract data-driven covariance from udr model. (A list of covariance matrices)\n",
     "    U.ud <- lapply(fit2$U,function (e) \"[[\"(e,\"mat\")) \n",
     "\n",
-    "    saveRDS(list(U=U.ud, w=fit2$w, loglik=fit2$loglik), ${_output:r})\n",
-    "\n"
+    "    saveRDS(list(U=U.ud, w=fit2$w, loglik=fit2$loglik), ${_output:r})"
    ]
   },
   {
@@ -766,7 +765,7 @@
    },
    "outputs": [],
    "source": [
-    "[ted]\n",
+    "[ud_unconstrained]\n",
     "# Method is `ed` or `ted`\n",
     "parameter: ud_method = \"ed\"\n",
     "# A typical choice is to estimate scales only for canonical components\n",
@@ -964,7 +963,7 @@
      "SoS"
     ]
    ],
-   "version": "0.24.0"
+   "version": "0.24.3"
   }
  },
  "nbformat": 4,

diff --git a/_sources/code/pecotmr_integration/twas_mr.ipynb b/_sources/code/pecotmr_integration/twas_mr.ipynb
@@ -131,6 +131,22 @@
     "    - `I2`: $I^2$ statistics"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "kernel": "SoS"
+   },
+   "source": [
+    "## Example\n",
+    "```\n",
+    "sos run ~/githubrepo/xqtl-pipeline/code/pecotmr_integration/twas_mr.ipynb twas_mr \\\n",
+    "   --cwd /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/output/ \\\n",
+    "   --gwas_meta_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/gwas_meta.tsv \\\n",
+    "   --ld_meta_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/ld_meta_file.tsv \\\n",
+    "   --xqtl_meta_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/xqtl_meta.tsv \n",
+    "```"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -285,7 +301,7 @@
     "        chrom_range = range(1, 23) if row['chrom'] == 0 else [row['chrom']]\n",
     "        for chrom in chrom_range:\n",
     "            if chrom in gwas_dict[row['study_id']]:\n",
-    "         e       existing_entry = gwas_dict[row['study_id']][chrom]\n",
+    "                existing_entry = gwas_dict[row['study_id']][chrom]\n",
     "                raise ValueError(f\"Duplicate chromosome specification for study_id {row['study_id']}, chrom {chrom}. \"\n",
     "                                 f\"Conflicting entries: {existing_entry} and {[file_path, mapping_file]}\")\n",
     "            gwas_dict[row['study_id']][chrom] = [file_path, mapping_file]\n",
@@ -316,12 +332,11 @@
     "depends: sos_variable(\"regional_data\")\n",
     "\n",
     "parameter: allele_qc = True\n",
-    "parameter: sets = \"sets\"\n",
-    "parameter: coverage = \"coverage_0.95\"\n",
+    "parameter: coverage = \"cs_coverage_0.95\"\n",
     "meta_info = [x[\"meta_info\"] for x in regional_data['xQTL'].values()]\n",
     "xqtl_files = [x[\"files\"] for x in regional_data['xQTL'].values()]\n",
     "input: xqtl_files, group_by = lambda x: group_by_region(x, xqtl_files), group_with = \"meta_info\"\n",
-    "output: f'{cwd:a}/{step_name[:-2]}/{name}.{_meta_info[3]}.twas_mr.rds'\n",
+    "output: f'{cwd:a}/{step_name}/{name}.{_meta_info[3]}.twas_mr.rds'\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
     "R: expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container, entrypoint = entrypoint\n",
     "    # we have potentially multiple weight db RDS files for each region of interest\n",
@@ -331,6 +346,8 @@
     "    end = ${_meta_info[2]}\n",
     "    region = \"${_meta_info[3]}\"\n",
     "    xqtl_conditions = c(${paths(_meta_info[4:]):r,})\n",
+    "    xqtl_conditions = unlist(strsplit(xqtl_conditions, \",\\\\s*\"))\n",
+    "    LD_meta_file_path = ${ld_meta_data:r}\n",
     "    gwas_studies = c(${paths(regional_data[\"GWAS\"].keys()):r,})\n",
     "    # load gwas data file for this particular chrom\n",
     "    gwas_files = c(${paths([v[_meta_info[0]] for k, v in regional_data[\"GWAS\"].items()]):r,})\n",
@@ -344,43 +361,43 @@
     "    gwas_data = list()\n",
     "    twas_mr_result = list()\n",
     "    for (s in seq_along(gwas_studies)) {\n",
-    "      gwas_data[[gwas_studies[s]]] = list()\n",
-    "      gwas_sumstats = fread(gwas_files[s])%>% \n",
+    "        gwas_data[[gwas_studies[s]]] = list()\n",
+    "        gwas_sumstats = fread(gwas_files[s])%>% \n",
     "                 rename(\"pos\" = \"position\", \"chrom\" = \"chromosome\", \"A1\" = \"ref\",\"A2\" = \"alt\")%>%\n",
     "                 mutate(z=beta/se)\n",
-    "      # Load LD list containing LD matrix and corresponding variants\n",
-    "      gwas_LD_list = load_LD_matrix(LD_meta_file_path, region_of_interest, gwas_sumstats)\n",
-    "      # Allele flip\n",
-    "      gwas_allele_flip= allele_qc(gwas_sumstats[,c(\"chrom\",\"pos\",\"A1\",\"A2\")], gwas_LD_list$combined_LD_variants,gwas_sumstats,c(\"beta\",\"se\",\"z\"))\n",
-    "      # Load LD matrix and sumstats\n",
-    "      gwas_data[[gwas_studies[s]]][[\"LD\"]] = gwas_LD_list$combined_LD_matrix\n",
-    "      gwas_data[[gwas_studies[s]]][[\"sumstats\"]] = gwas_allele_flip$target_data_qced\n",
-    "   for (condition in xqtl_conditions) {\n",
-    "      twas_mr_result[[gwas_studies[s]]][[condition]] = list()\n",
-    "      # Step 3: TWAS analysis\n",
-    "      # Step 3-1: Intersect with gwas summary statistics and adjust susie weights\n",
-    "      adjusted_susie_weights = adjust_susie_weights(twas_weights_results, condition,keep_variants = get_nested_element(gwas_data, c(gwas_studies[s], \"sumstats\", \"variant\")),allele_qc = ${\"TRUE\" if allele_qc else \"FALSE\"})\n",
-    "      # Step 3-2: Overlap weights of other methods with the variants name of adjusted_susie_weights, then combine with adjusted susie weights to obtain the subsetted weight matrix\n",
-    "      weights_matrix = get_nested_element(twas_weights_results, c(\"weights\", condition))\n",
-    "      weights_matrix_subset = cbind(susie_weights = adjusted_susie_weights$adjusted_susie_weights, weights_matrix[adjusted_susie_weights$remained_variants_ids,!colnames(weights_matrix) %in% \"susie_weights\"])\n",
-    "      if(allele_qc == ${\"TRUE\" if allele_qc else \"FALSE\"}){\n",
-    "            weights_matrix_qced <- allele_qc(rownames(weights_matrix_subset), gwas_LD_list$combined_LD_variants, weights_matrix_subset,1:ncol(weights_matrix))\n",
-    "            weights_matrix_subset <- weights_matrix_qced$target_data_qced[,!colnames(weights_matrix_qced$target_data_qced) %in% c(\"chrom\",\"pos\",\"A1\",\"A2\",\"variant_id\")]\n",
-    "            rownames(weights_matrix_subset) <- get_nested_element(weights_matrix_qced, c(\"target_data_qced\",\"variant_id\"))\n",
-    "      }\n",
-    "      # Step 3-3: Conduct twas analysis\n",
-    "      twas_result <- twas_analysis(weights_matrix_subset, gwas_data[[gwas_studies[s]]][[\"sumstats\"]],gwas_data[[gwas_studies[s]]][[\"LD\"]],rownames(weights_matrix_subset))\n",
-    "      # Step 4: Conduct mr analysis\n",
-    "      mr_formatted_input <- mr_format(twas_weights_results, condition, gwas_data[[gwas_studies[s]]][[\"sumstats\"]], sets = ${sets},coverage = ${coverage},allele_qc = ${\"TRUE\" if allele_qc else \"FALSE\"})\n",
-    "      mr_result <- mr_analysis(mr_formatted_input, cpip_cutoff = 0.5)\n",
-    "      # Step 5: Output the twas mr result\n",
-    "      twas_mr_result[[gwas_studies[s]]][[condition]] = list(twas_weights_matrix_subset = weights_matrix_subset, \n",
-    "                                                              mr_formatted_input = mr_formatted_input,\n",
-    "                                                                     twas_result = twas_result,\n",
-    "                                                                       mr_result = mr_result)\n",
-    "   }\n",
-    "}\n",
-    "saveRDS(twas_mr_result, ${_output:ar})"
+    "        # Load LD list containing LD matrix and corresponding variants\n",
+    "        gwas_LD_list = load_LD_matrix(LD_meta_file_path, region_of_interest, gwas_sumstats)\n",
+    "        # Allele flip\n",
+    "        gwas_allele_flip= allele_qc(gwas_sumstats[,c(\"chrom\",\"pos\",\"A1\",\"A2\")], gwas_LD_list$combined_LD_variants,gwas_sumstats,c(\"beta\",\"se\",\"z\"))\n",
+    "        # Load LD matrix and sumstats\n",
+    "        gwas_data[[gwas_studies[s]]][[\"LD\"]] = gwas_LD_list$combined_LD_matrix\n",
+    "        gwas_data[[gwas_studies[s]]][[\"sumstats\"]] = gwas_allele_flip$target_data_qced\n",
+    "        for (condition in xqtl_conditions) {\n",
+    "            twas_mr_result[[gwas_studies[s]]][[condition]] = list()\n",
+    "            # Step 3: TWAS analysis\n",
+    "            # Step 3-1: Intersect with gwas summary statistics and adjust susie weights\n",
+    "            adjusted_susie_weights = adjust_susie_weights(twas_weights_results, condition,keep_variants = get_nested_element(gwas_data, c(gwas_studies[s], \"sumstats\", \"variant_id\")),allele_qc = ${\"TRUE\" if allele_qc else \"FALSE\"})\n",
+    "            # Step 3-2: Overlap weights of other methods with the variants name of adjusted_susie_weights, then combine with adjusted susie weights to obtain the subsetted weight matrix\n",
+    "            weights_matrix = get_nested_element(twas_weights_results, c(\"weights\", condition))\n",
+    "            weights_matrix_subset = cbind(susie_weights = adjusted_susie_weights$adjusted_susie_weights, weights_matrix[adjusted_susie_weights$remained_variants_ids,!colnames(weights_matrix) %in% \"susie_weights\"])\n",
+    "            if(${\"TRUE\" if allele_qc else \"FALSE\"}){\n",
+    "                weights_matrix_qced <- allele_qc(rownames(weights_matrix_subset), gwas_LD_list$combined_LD_variants, weights_matrix_subset,1:ncol(weights_matrix))\n",
+    "                weights_matrix_subset <- weights_matrix_qced$target_data_qced[,!colnames(weights_matrix_qced$target_data_qced) %in% c(\"chrom\",\"pos\",\"A1\",\"A2\",\"variant_id\")]\n",
+    "                rownames(weights_matrix_subset) <- get_nested_element(weights_matrix_qced, c(\"target_data_qced\",\"variant_id\"))\n",
+    "            }\n",
+    "            # Step 3-3: Conduct twas analysis\n",
+    "            twas_result <- twas_analysis(weights_matrix_subset, gwas_data[[gwas_studies[s]]][[\"sumstats\"]],gwas_data[[gwas_studies[s]]][[\"LD\"]],rownames(weights_matrix_subset))\n",
+    "            # Step 4: Conduct mr analysis\n",
+    "            mr_formatted_input <- mr_format(twas_weights_results, condition, gwas_data[[gwas_studies[s]]][[\"sumstats\"]], coverage = \"${coverage}\", allele_qc = ${\"TRUE\" if allele_qc else \"FALSE\"})\n",
+    "            mr_result <- mr_analysis(mr_formatted_input, cpip_cutoff = 0.5)\n",
+    "            # Step 5: Output the twas mr result\n",
+    "            twas_mr_result[[gwas_studies[s]]][[condition]] = list(twas_weights_matrix_subset = weights_matrix_subset, \n",
+    "                                                                  mr_formatted_input = mr_formatted_input,\n",
+    "                                                                         twas_result = twas_result,\n",
+    "                                                                           mr_result = mr_result)\n",
+    "        }\n",
+    "    }\n",
+    "    saveRDS(twas_mr_result, ${_output:ar})"
    ]
   }
  ],

diff --git a/code/molecular_phenotypes/calling/RNA_calling.html b/code/molecular_phenotypes/calling/RNA_calling.html
@@ -1968,7 +1968,7 @@ <h3>Step Outputs:<a class="headerlink" href="#id8" title="Permalink to this head
 parameter: reference_fasta = path
 # For the patterned flowcell models (HiSeq X), change to 2500
 parameter: optical_distance = 100
-parameter: zap_raw_bam = True
+parameter: zap_raw_bam = False
 picard_strand_dict = {&quot;rf&quot;:&quot;SECOND_READ_TRANSCRIPTION_STRAND&quot;,&quot;fr&quot;: &quot;FIRST_READ_TRANSCRIPTION_STRAND&quot;,&quot;unstranded&quot;:&quot;NONE&quot; }
 input: output_from(&quot;STAR_align_1&quot;), group_by = 2, group_with = {&quot;sample_id&quot;,&quot;strand&quot;}
 output: picard_metrics = f&#39;{_sample_id}.alignment_summary_metrics&#39;,
@@ -2020,7 +2020,7 @@ <h3>Step Outputs:<a class="headerlink" href="#id8" title="Permalink to this head
         -VALIDATION_STRINGENCY STRICT \
         -REMOVE_SEQUENCING_DUPLICATES false \
         -REMOVE_DUPLICATES false
-
+    samtools index ${_output[2]}
     # Record end timestamp and calculate runtime for MarkDuplicates
     end_time=$(date +%s)
     runtime=$((end_time - start_time))
@@ -2044,6 +2044,7 @@ <h3>Step Outputs:<a class="headerlink" href="#id8" title="Permalink to this head
         strand = []
         tag = []
         annotation_gtf = &quot;${gtf}.tmp.${_input[&quot;cord_bam&quot;]:bnnnn}&quot;
+
         with open(annotation_gtf, &#39;r&#39;) as gtf:
             for row in gtf:
                 row = row.strip().split(&#39;\t&#39;)
@@ -2105,16 +2106,16 @@ <h3>Step Outputs:<a class="headerlink" href="#id8" title="Permalink to this head
     }
 
     export -f generate_bigwig  # To make it available in subshells, if needed.
-    generate_bigwig ${_output[&quot;md_bam&quot;]}
+    generate_bigwig $[_output[&quot;md_bam&quot;]]
 
 import pandas as pd
 out = pd.DataFrame({
     &quot;sample_id&quot;: _sample_id,
     &quot;strand&quot;: _strand,
-    &quot;coord_bam_list&quot;: _output[&quot;md_bam&quot;],
-    &quot;BW_list&quot;: _output[&quot;bigwig&quot;],
+    &quot;coord_bam_list&quot;: f&#39;{_output[&quot;md_bam&quot;]:b}&#39;,
+    &quot;BW_list&quot;: f&#39;{_output[&quot;bigwig&quot;]:b}&#39;,
     &quot;SJ_list&quot;: f&#39;{_output[&quot;md_bam&quot;]:bnnnnn}.SJ.out.tab&#39;,
-    &quot;trans_bam_list&quot;: f&#39;{_output[&quot;md_bam&quot;]:bnnnn}.toTranscriptome.out.bam&#39;
+    &quot;trans_bam_list&quot;: f&#39;{_output[&quot;md_bam&quot;]:bnnnn}.toTranscriptome.out_wasp_qc.bam&#39;
 
 })
 out.to_csv(_output[&quot;output_summary&quot;], sep=&quot;\t&quot;, index=False)

diff --git a/code/multivariate_genome/MASH/mash_preprocessing.html b/code/multivariate_genome/MASH/mash_preprocessing.html
@@ -851,9 +851,9 @@ <h3>Example<a class="headerlink" href="#example" title="Permalink to this headin
       renamed_res_reformatted &lt;- rename_rownames(res_reformatted)
       merged_data &lt;- list()
       for(region in names(renamed_res_reformatted)){
-        merged_data &lt;- merge_data(merged_data, renamed_res_reformatted[[region]])
+        merged_data &lt;- merge_mash_data(merged_data, renamed_res_reformatted[[region]])
       }
-      batch_combined_data &lt;- merge_data(batch_combined_data,merged_data)
+      batch_combined_data &lt;- merge_mash_data(batch_combined_data,merged_data)
     } 
    conditions = c(&quot;strong&quot;, &quot;random&quot;, &quot;null&quot;)
    for (cond in conditions){