Skip to content

Commit

Permalink
Update documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
gaow committed Mar 18, 2024
1 parent c8bfeda commit 36de0ae
Show file tree
Hide file tree
Showing 9 changed files with 134 additions and 104 deletions.
15 changes: 8 additions & 7 deletions _sources/code/molecular_phenotypes/calling/RNA_calling.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1372,7 +1372,7 @@
"bash: container=container,expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', entrypoint=entrypoint\n",
" fastp -i ${f'{_input[0]} -I {_input[1]}' if is_paired_end else _input} -o ${ f'{_output[0]} -O {_output[1]}' if is_paired_end else _output } \\\n",
" ${f'--adapter_fasta {fasta_with_adapters_etc}' if fasta_with_adapters_etc.is_file() else \"--detect_adapter_for_pe\"} -V -h ${_output[0]:n}.html -j ${_output[0]:n}.json -w ${numThreads} \\\n",
" --length_required ${min_len} -W ${window_size} -M ${required_quality} -5 -3 --cut_front_mean_quality ${leading} --cut_tail_mean_quality ${leading}\n"
" --length_required ${min_len} -W ${window_size} -M ${required_quality} -5 -3 --cut_front_mean_quality ${leading} --cut_tail_mean_quality ${leading}"
]
},
{
Expand Down Expand Up @@ -1769,7 +1769,7 @@
"parameter: reference_fasta = path\n",
"# For the patterned flowcell models (HiSeq X), change to 2500\n",
"parameter: optical_distance = 100\n",
"parameter: zap_raw_bam = True\n",
"parameter: zap_raw_bam = False\n",
"picard_strand_dict = {\"rf\":\"SECOND_READ_TRANSCRIPTION_STRAND\",\"fr\": \"FIRST_READ_TRANSCRIPTION_STRAND\",\"unstranded\":\"NONE\" }\n",
"input: output_from(\"STAR_align_1\"), group_by = 2, group_with = {\"sample_id\",\"strand\"}\n",
"output: picard_metrics = f'{_sample_id}.alignment_summary_metrics',\n",
Expand Down Expand Up @@ -1821,7 +1821,7 @@
" -VALIDATION_STRINGENCY STRICT \\\n",
" -REMOVE_SEQUENCING_DUPLICATES false \\\n",
" -REMOVE_DUPLICATES false\n",
" \n",
" samtools index ${_output[2]}\n",
" # Record end timestamp and calculate runtime for MarkDuplicates\n",
" end_time=$(date +%s)\n",
" runtime=$((end_time - start_time))\n",
Expand All @@ -1845,6 +1845,7 @@
" strand = []\n",
" tag = []\n",
" annotation_gtf = \"${gtf}.tmp.${_input[\"cord_bam\"]:bnnnn}\"\n",
"\n",
" with open(annotation_gtf, 'r') as gtf:\n",
" for row in gtf:\n",
" row = row.strip().split('\\t')\n",
Expand Down Expand Up @@ -1906,16 +1907,16 @@
" }\n",
"\n",
" export -f generate_bigwig # To make it available in subshells, if needed.\n",
" generate_bigwig ${_output[\"md_bam\"]}\n",
" generate_bigwig $[_output[\"md_bam\"]]\n",
"\n",
"import pandas as pd\n",
"out = pd.DataFrame({\n",
" \"sample_id\": _sample_id,\n",
" \"strand\": _strand,\n",
" \"coord_bam_list\": _output[\"md_bam\"],\n",
" \"BW_list\": _output[\"bigwig\"],\n",
" \"coord_bam_list\": f'{_output[\"md_bam\"]:b}',\n",
" \"BW_list\": f'{_output[\"bigwig\"]:b}',\n",
" \"SJ_list\": f'{_output[\"md_bam\"]:bnnnnn}.SJ.out.tab',\n",
" \"trans_bam_list\": f'{_output[\"md_bam\"]:bnnnn}.toTranscriptome.out.bam'\n",
" \"trans_bam_list\": f'{_output[\"md_bam\"]:bnnnn}.toTranscriptome.out_wasp_qc.bam'\n",
"\n",
"})\n",
"out.to_csv(_output[\"output_summary\"], sep=\"\\t\", index=False)\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -405,9 +405,9 @@
" renamed_res_reformatted <- rename_rownames(res_reformatted)\n",
" merged_data <- list()\n",
" for(region in names(renamed_res_reformatted)){\n",
" merged_data <- merge_data(merged_data, renamed_res_reformatted[[region]])\n",
" merged_data <- merge_mash_data(merged_data, renamed_res_reformatted[[region]])\n",
" }\n",
" batch_combined_data <- merge_data(batch_combined_data,merged_data)\n",
" batch_combined_data <- merge_mash_data(batch_combined_data,merged_data)\n",
" } \n",
" conditions = c(\"strong\", \"random\", \"null\")\n",
" for (cond in conditions){\n",
Expand Down
9 changes: 4 additions & 5 deletions _sources/code/multivariate_genome/MASH/mixture_prior.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -748,14 +748,13 @@
" fit0 <- ud_init(mash_data, n_unconstrained = 50, U_scaled = U.can)\n",
" # Fit udr and use penalty as default as suggested by Yunqi\n",
" # penalty is necessary in small sample size case, and there won't be a difference in large sample size \n",
" fit2 = ud_fit(fit0, control = list(unconstrained.update = \"ted\", scaled.update = \"fa\", resid.update = 'none', \n",
" fit2 = ud_fit(fit0, control = list(unconstrained.update = \"${unconstrained_prior}\", scaled.update = \"fa\", resid.update = 'none', \n",
" lambda =lambda, penalty.type = \"iw\", maxiter=1e3, tol = 1e-2, tol.lik = 1e-2))\n",
"\n",
" # extract data-driven covariance from udr model. (A list of covariance matrices)\n",
" U.ud <- lapply(fit2$U,function (e) \"[[\"(e,\"mat\")) \n",
"\n",
" saveRDS(list(U=U.ud, w=fit2$w, loglik=fit2$loglik), ${_output:r})\n",
"\n"
" saveRDS(list(U=U.ud, w=fit2$w, loglik=fit2$loglik), ${_output:r})"
]
},
{
Expand All @@ -766,7 +765,7 @@
},
"outputs": [],
"source": [
"[ted]\n",
"[ud_unconstrained]\n",
"# Method is `ed` or `ted`\n",
"parameter: ud_method = \"ed\"\n",
"# A typical choice is to estimate scales only for canonical components\n",
Expand Down Expand Up @@ -964,7 +963,7 @@
"SoS"
]
],
"version": "0.24.0"
"version": "0.24.3"
}
},
"nbformat": 4,
Expand Down
95 changes: 56 additions & 39 deletions _sources/code/pecotmr_integration/twas_mr.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,22 @@
" - `I2`: $I^2$ statistics"
]
},
{
"cell_type": "markdown",
"metadata": {
"kernel": "SoS"
},
"source": [
"## Example\n",
"```\n",
"sos run ~/githubrepo/xqtl-pipeline/code/pecotmr_integration/twas_mr.ipynb twas_mr \\\n",
" --cwd /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/output/ \\\n",
" --gwas_meta_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/gwas_meta.tsv \\\n",
" --ld_meta_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/ld_meta_file.tsv \\\n",
" --xqtl_meta_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/xqtl_meta.tsv \n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -285,7 +301,7 @@
" chrom_range = range(1, 23) if row['chrom'] == 0 else [row['chrom']]\n",
" for chrom in chrom_range:\n",
" if chrom in gwas_dict[row['study_id']]:\n",
" e existing_entry = gwas_dict[row['study_id']][chrom]\n",
" existing_entry = gwas_dict[row['study_id']][chrom]\n",
" raise ValueError(f\"Duplicate chromosome specification for study_id {row['study_id']}, chrom {chrom}. \"\n",
" f\"Conflicting entries: {existing_entry} and {[file_path, mapping_file]}\")\n",
" gwas_dict[row['study_id']][chrom] = [file_path, mapping_file]\n",
Expand Down Expand Up @@ -316,12 +332,11 @@
"depends: sos_variable(\"regional_data\")\n",
"\n",
"parameter: allele_qc = True\n",
"parameter: sets = \"sets\"\n",
"parameter: coverage = \"coverage_0.95\"\n",
"parameter: coverage = \"cs_coverage_0.95\"\n",
"meta_info = [x[\"meta_info\"] for x in regional_data['xQTL'].values()]\n",
"xqtl_files = [x[\"files\"] for x in regional_data['xQTL'].values()]\n",
"input: xqtl_files, group_by = lambda x: group_by_region(x, xqtl_files), group_with = \"meta_info\"\n",
"output: f'{cwd:a}/{step_name[:-2]}/{name}.{_meta_info[3]}.twas_mr.rds'\n",
"output: f'{cwd:a}/{step_name}/{name}.{_meta_info[3]}.twas_mr.rds'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
"R: expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container, entrypoint = entrypoint\n",
" # we have potentially multiple weight db RDS files for each region of interest\n",
Expand All @@ -331,6 +346,8 @@
" end = ${_meta_info[2]}\n",
" region = \"${_meta_info[3]}\"\n",
" xqtl_conditions = c(${paths(_meta_info[4:]):r,})\n",
" xqtl_conditions = unlist(strsplit(xqtl_conditions, \",\\\\s*\"))\n",
" LD_meta_file_path = ${ld_meta_data:r}\n",
" gwas_studies = c(${paths(regional_data[\"GWAS\"].keys()):r,})\n",
" # load gwas data file for this particular chrom\n",
" gwas_files = c(${paths([v[_meta_info[0]] for k, v in regional_data[\"GWAS\"].items()]):r,})\n",
Expand All @@ -344,43 +361,43 @@
" gwas_data = list()\n",
" twas_mr_result = list()\n",
" for (s in seq_along(gwas_studies)) {\n",
" gwas_data[[gwas_studies[s]]] = list()\n",
" gwas_sumstats = fread(gwas_files[s])%>% \n",
" gwas_data[[gwas_studies[s]]] = list()\n",
" gwas_sumstats = fread(gwas_files[s])%>% \n",
" rename(\"pos\" = \"position\", \"chrom\" = \"chromosome\", \"A1\" = \"ref\",\"A2\" = \"alt\")%>%\n",
" mutate(z=beta/se)\n",
" # Load LD list containing LD matrix and corresponding variants\n",
" gwas_LD_list = load_LD_matrix(LD_meta_file_path, region_of_interest, gwas_sumstats)\n",
" # Allele flip\n",
" gwas_allele_flip= allele_qc(gwas_sumstats[,c(\"chrom\",\"pos\",\"A1\",\"A2\")], gwas_LD_list$combined_LD_variants,gwas_sumstats,c(\"beta\",\"se\",\"z\"))\n",
" # Load LD matrix and sumstats\n",
" gwas_data[[gwas_studies[s]]][[\"LD\"]] = gwas_LD_list$combined_LD_matrix\n",
" gwas_data[[gwas_studies[s]]][[\"sumstats\"]] = gwas_allele_flip$target_data_qced\n",
" for (condition in xqtl_conditions) {\n",
" twas_mr_result[[gwas_studies[s]]][[condition]] = list()\n",
" # Step 3: TWAS analysis\n",
" # Step 3-1: Intersect with gwas summary statistics and adjust susie weights\n",
" adjusted_susie_weights = adjust_susie_weights(twas_weights_results, condition,keep_variants = get_nested_element(gwas_data, c(gwas_studies[s], \"sumstats\", \"variant\")),allele_qc = ${\"TRUE\" if allele_qc else \"FALSE\"})\n",
" # Step 3-2: Overlap weights of other methods with the variants name of adjusted_susie_weights, then combine with adjusted susie weights to obtain the subsetted weight matrix\n",
" weights_matrix = get_nested_element(twas_weights_results, c(\"weights\", condition))\n",
" weights_matrix_subset = cbind(susie_weights = adjusted_susie_weights$adjusted_susie_weights, weights_matrix[adjusted_susie_weights$remained_variants_ids,!colnames(weights_matrix) %in% \"susie_weights\"])\n",
" if(allele_qc == ${\"TRUE\" if allele_qc else \"FALSE\"}){\n",
" weights_matrix_qced <- allele_qc(rownames(weights_matrix_subset), gwas_LD_list$combined_LD_variants, weights_matrix_subset,1:ncol(weights_matrix))\n",
" weights_matrix_subset <- weights_matrix_qced$target_data_qced[,!colnames(weights_matrix_qced$target_data_qced) %in% c(\"chrom\",\"pos\",\"A1\",\"A2\",\"variant_id\")]\n",
" rownames(weights_matrix_subset) <- get_nested_element(weights_matrix_qced, c(\"target_data_qced\",\"variant_id\"))\n",
" }\n",
" # Step 3-3: Conduct twas analysis\n",
" twas_result <- twas_analysis(weights_matrix_subset, gwas_data[[gwas_studies[s]]][[\"sumstats\"]],gwas_data[[gwas_studies[s]]][[\"LD\"]],rownames(weights_matrix_subset))\n",
" # Step 4: Conduct mr analysis\n",
" mr_formatted_input <- mr_format(twas_weights_results, condition, gwas_data[[gwas_studies[s]]][[\"sumstats\"]], sets = ${sets},coverage = ${coverage},allele_qc = ${\"TRUE\" if allele_qc else \"FALSE\"})\n",
" mr_result <- mr_analysis(mr_formatted_input, cpip_cutoff = 0.5)\n",
" # Step 5: Output the twas mr result\n",
" twas_mr_result[[gwas_studies[s]]][[condition]] = list(twas_weights_matrix_subset = weights_matrix_subset, \n",
" mr_formatted_input = mr_formatted_input,\n",
" twas_result = twas_result,\n",
" mr_result = mr_result)\n",
" }\n",
"}\n",
"saveRDS(twas_mr_result, ${_output:ar})"
" # Load LD list containing LD matrix and corresponding variants\n",
" gwas_LD_list = load_LD_matrix(LD_meta_file_path, region_of_interest, gwas_sumstats)\n",
" # Allele flip\n",
" gwas_allele_flip= allele_qc(gwas_sumstats[,c(\"chrom\",\"pos\",\"A1\",\"A2\")], gwas_LD_list$combined_LD_variants,gwas_sumstats,c(\"beta\",\"se\",\"z\"))\n",
" # Load LD matrix and sumstats\n",
" gwas_data[[gwas_studies[s]]][[\"LD\"]] = gwas_LD_list$combined_LD_matrix\n",
" gwas_data[[gwas_studies[s]]][[\"sumstats\"]] = gwas_allele_flip$target_data_qced\n",
" for (condition in xqtl_conditions) {\n",
" twas_mr_result[[gwas_studies[s]]][[condition]] = list()\n",
" # Step 3: TWAS analysis\n",
" # Step 3-1: Intersect with gwas summary statistics and adjust susie weights\n",
" adjusted_susie_weights = adjust_susie_weights(twas_weights_results, condition,keep_variants = get_nested_element(gwas_data, c(gwas_studies[s], \"sumstats\", \"variant_id\")),allele_qc = ${\"TRUE\" if allele_qc else \"FALSE\"})\n",
" # Step 3-2: Overlap weights of other methods with the variants name of adjusted_susie_weights, then combine with adjusted susie weights to obtain the subsetted weight matrix\n",
" weights_matrix = get_nested_element(twas_weights_results, c(\"weights\", condition))\n",
" weights_matrix_subset = cbind(susie_weights = adjusted_susie_weights$adjusted_susie_weights, weights_matrix[adjusted_susie_weights$remained_variants_ids,!colnames(weights_matrix) %in% \"susie_weights\"])\n",
" if(${\"TRUE\" if allele_qc else \"FALSE\"}){\n",
" weights_matrix_qced <- allele_qc(rownames(weights_matrix_subset), gwas_LD_list$combined_LD_variants, weights_matrix_subset,1:ncol(weights_matrix))\n",
" weights_matrix_subset <- weights_matrix_qced$target_data_qced[,!colnames(weights_matrix_qced$target_data_qced) %in% c(\"chrom\",\"pos\",\"A1\",\"A2\",\"variant_id\")]\n",
" rownames(weights_matrix_subset) <- get_nested_element(weights_matrix_qced, c(\"target_data_qced\",\"variant_id\"))\n",
" }\n",
" # Step 3-3: Conduct twas analysis\n",
" twas_result <- twas_analysis(weights_matrix_subset, gwas_data[[gwas_studies[s]]][[\"sumstats\"]],gwas_data[[gwas_studies[s]]][[\"LD\"]],rownames(weights_matrix_subset))\n",
" # Step 4: Conduct mr analysis\n",
" mr_formatted_input <- mr_format(twas_weights_results, condition, gwas_data[[gwas_studies[s]]][[\"sumstats\"]], coverage = \"${coverage}\", allele_qc = ${\"TRUE\" if allele_qc else \"FALSE\"})\n",
" mr_result <- mr_analysis(mr_formatted_input, cpip_cutoff = 0.5)\n",
" # Step 5: Output the twas mr result\n",
" twas_mr_result[[gwas_studies[s]]][[condition]] = list(twas_weights_matrix_subset = weights_matrix_subset, \n",
" mr_formatted_input = mr_formatted_input,\n",
" twas_result = twas_result,\n",
" mr_result = mr_result)\n",
" }\n",
" }\n",
" saveRDS(twas_mr_result, ${_output:ar})"
]
}
],
Expand Down
13 changes: 7 additions & 6 deletions code/molecular_phenotypes/calling/RNA_calling.html
Original file line number Diff line number Diff line change
Expand Up @@ -1968,7 +1968,7 @@ <h3>Step Outputs:<a class="headerlink" href="#id8" title="Permalink to this head
parameter: reference_fasta = path
# For the patterned flowcell models (HiSeq X), change to 2500
parameter: optical_distance = 100
parameter: zap_raw_bam = True
parameter: zap_raw_bam = False
picard_strand_dict = {&quot;rf&quot;:&quot;SECOND_READ_TRANSCRIPTION_STRAND&quot;,&quot;fr&quot;: &quot;FIRST_READ_TRANSCRIPTION_STRAND&quot;,&quot;unstranded&quot;:&quot;NONE&quot; }
input: output_from(&quot;STAR_align_1&quot;), group_by = 2, group_with = {&quot;sample_id&quot;,&quot;strand&quot;}
output: picard_metrics = f&#39;{_sample_id}.alignment_summary_metrics&#39;,
Expand Down Expand Up @@ -2020,7 +2020,7 @@ <h3>Step Outputs:<a class="headerlink" href="#id8" title="Permalink to this head
-VALIDATION_STRINGENCY STRICT \
-REMOVE_SEQUENCING_DUPLICATES false \
-REMOVE_DUPLICATES false

samtools index ${_output[2]}
# Record end timestamp and calculate runtime for MarkDuplicates
end_time=$(date +%s)
runtime=$((end_time - start_time))
Expand All @@ -2044,6 +2044,7 @@ <h3>Step Outputs:<a class="headerlink" href="#id8" title="Permalink to this head
strand = []
tag = []
annotation_gtf = &quot;${gtf}.tmp.${_input[&quot;cord_bam&quot;]:bnnnn}&quot;

with open(annotation_gtf, &#39;r&#39;) as gtf:
for row in gtf:
row = row.strip().split(&#39;\t&#39;)
Expand Down Expand Up @@ -2105,16 +2106,16 @@ <h3>Step Outputs:<a class="headerlink" href="#id8" title="Permalink to this head
}

export -f generate_bigwig # To make it available in subshells, if needed.
generate_bigwig ${_output[&quot;md_bam&quot;]}
generate_bigwig $[_output[&quot;md_bam&quot;]]

import pandas as pd
out = pd.DataFrame({
&quot;sample_id&quot;: _sample_id,
&quot;strand&quot;: _strand,
&quot;coord_bam_list&quot;: _output[&quot;md_bam&quot;],
&quot;BW_list&quot;: _output[&quot;bigwig&quot;],
&quot;coord_bam_list&quot;: f&#39;{_output[&quot;md_bam&quot;]:b}&#39;,
&quot;BW_list&quot;: f&#39;{_output[&quot;bigwig&quot;]:b}&#39;,
&quot;SJ_list&quot;: f&#39;{_output[&quot;md_bam&quot;]:bnnnnn}.SJ.out.tab&#39;,
&quot;trans_bam_list&quot;: f&#39;{_output[&quot;md_bam&quot;]:bnnnn}.toTranscriptome.out.bam&#39;
&quot;trans_bam_list&quot;: f&#39;{_output[&quot;md_bam&quot;]:bnnnn}.toTranscriptome.out_wasp_qc.bam&#39;

})
out.to_csv(_output[&quot;output_summary&quot;], sep=&quot;\t&quot;, index=False)
Expand Down
4 changes: 2 additions & 2 deletions code/multivariate_genome/MASH/mash_preprocessing.html
Original file line number Diff line number Diff line change
Expand Up @@ -851,9 +851,9 @@ <h3>Example<a class="headerlink" href="#example" title="Permalink to this headin
renamed_res_reformatted &lt;- rename_rownames(res_reformatted)
merged_data &lt;- list()
for(region in names(renamed_res_reformatted)){
merged_data &lt;- merge_data(merged_data, renamed_res_reformatted[[region]])
merged_data &lt;- merge_mash_data(merged_data, renamed_res_reformatted[[region]])
}
batch_combined_data &lt;- merge_data(batch_combined_data,merged_data)
batch_combined_data &lt;- merge_mash_data(batch_combined_data,merged_data)
}
conditions = c(&quot;strong&quot;, &quot;random&quot;, &quot;null&quot;)
for (cond in conditions){
Expand Down
Loading

0 comments on commit 36de0ae

Please sign in to comment.