Commit

Update documentation
gaow committed Mar 15, 2024
1 parent 576f8c0 commit c8bfeda
Showing 13 changed files with 661 additions and 372 deletions.
12 changes: 6 additions & 6 deletions _sources/code/data_preprocessing/genotype/VCF_QC.ipynb
@@ -157,7 +157,7 @@
"source": [
"sos run VCF_QC.ipynb rename_chrs \\\n",
" --genoFile reference_data/00-All.vcf.gz \\\n",
" --cwd reference_data --container bioinfo.sif"
" --cwd reference_data --container oras://ghcr.io/cumc/bioinfo_apptainer:latest"
]
},
{
@@ -174,7 +174,7 @@
"source": [
"sos run VCF_QC.ipynb dbsnp_annotate \\\n",
" --genoFile reference_data/00-All.add_chr.vcf.gz \\\n",
" --cwd reference_data --container bioinfo.sif"
" --cwd reference_data --container oras://ghcr.io/cumc/bioinfo_apptainer:latest"
]
},
{
@@ -193,7 +193,7 @@
" --genoFile data/MWE/MWE_genotype.vcf \\\n",
" --dbsnp-variants data/reference_data/00-All.add_chr.variants.gz \\\n",
" --reference-genome data/reference_data/GRCh38_full_analysis_set_plus_decoy_hla.noALT_noHLA_noDecoy_ERCC.fasta \\\n",
" --cwd MWE/output/genotype_1 --container bioinfo.sif -J 1 -c csg.yml -q csg"
" --cwd MWE/output/genotype_1 --container oras://ghcr.io/cumc/bioinfo_apptainer:latest"
]
},
{
@@ -222,7 +222,7 @@
" --genoFile data/mwe/mwe_genotype_list \\\n",
" --dbsnp-variants data/reference_data/00-All.add_chr.variants.gz \\\n",
" --reference-genome data/reference_data/GRCh38_full_analysis_set_plus_decoy_hla.noALT_noHLA_noDecoy_ERCC.fasta \\\n",
" --cwd MWE/output/genotype_4 --container bioinfo.sif --add-chr"
" --cwd MWE/output/genotype_4 --container oras://ghcr.io/cumc/bioinfo_apptainer:latest"
]
},
{
@@ -542,9 +542,9 @@
},
"outputs": [],
"source": [
"[rename_chrs: provides = '{filename}.add_chr.vcf.gz']\n",
"[rename_chrs: provides = '{genoFile:nn}.add_chr.vcf.gz']\n",
"# This file can be downloaded from https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh38p7/VCF/00-All.vcf.gz.\n",
"input: f'{filename}.vcf.gz'\n",
"input: genoFile\n",
"output: f'{_input:nn}.add_chr.vcf.gz'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
"bash: container = container, expand= \"${ }\", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout', entrypoint=entrypoint\n",
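Note on the VCF_QC.ipynb changes above: the examples now point `--container` at the published image `oras://ghcr.io/cumc/bioinfo_apptainer:latest` instead of a local `bioinfo.sif`, and the `rename_chrs` step keys its `provides` and `input` on the `genoFile` parameter rather than a `{filename}` wildcard. The bash body of `rename_chrs` is truncated in this diff (it most likely wraps a bcftools command); the sketch below is only a hypothetical illustration of its effect, namely prefixing contig names in the dbSNP VCF with `chr`.

```python
# Hypothetical sketch, not the pipeline's actual rename_chrs body (which is not shown in this
# diff): add a "chr" prefix to contig names in a VCF. Plain gzip is used here for simplicity;
# a real pipeline would bgzip and tabix-index the output.
import gzip

def add_chr_prefix(in_vcf: str, out_vcf: str) -> None:
    with gzip.open(in_vcf, "rt") as fin, gzip.open(out_vcf, "wt") as fout:
        for line in fin:
            if line.startswith("#"):
                # rename contigs declared in the header as well
                fout.write(line.replace("##contig=<ID=", "##contig=<ID=chr"))
            else:
                fout.write("chr" + line)

# Example (paths follow the command shown above):
# add_chr_prefix("reference_data/00-All.vcf.gz", "reference_data/00-All.add_chr.vcf.gz")
```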
@@ -1191,6 +1191,7 @@
"parameter: sample_participant_lookup = path()\n",
"input: phenoFile, annotation_gtf\n",
"output: f'{cwd:a}/{_input[0]:bn}.formated.bed.gz', f'{cwd:a}/{_input[0]:bn}.phenotype_group.txt'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output[0]:bn}' \n",
"python: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container, entrypoint=entrypoint\n",
" import pandas as pd\n",
" import numpy as np\n",
@@ -780,6 +780,8 @@
" names(bed_file)[1] <- \"#Chr\"\n",
" bed_file$'#Chr' <- sub(\"^\", \"chr\", bed_file$'#Chr')\n",
" row.names(bed_file) <- NULL \n",
"\n",
" bed_file <- bed_file[order(bed_file$'#Chr',bed_file$start,bed_file$end),]\n",
" \n",
" # Create BED file output\n",
" write.table(x=bed_file, file = \"${cwd}/psichomics_raw_data_bedded.txt\", quote = FALSE, row.names = FALSE, sep = \"\\t\")"
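The lines added above sort the psichomics BED records by chromosome, start, and end before `write.table`; coordinate-sorted output is what downstream compression and indexing (e.g., bgzip/tabix) typically expect. A minimal pandas equivalent, assuming the same column names and an illustrative file path:

```python
# Sketch of the same sort-before-write step in pandas. Column names ("#Chr", "start", "end")
# mirror the R code above; the file path is illustrative rather than the pipeline's actual output.
import pandas as pd

bed = pd.read_csv("psichomics_raw_data_bedded.txt", sep="\t")
bed = bed.sort_values(["#Chr", "start", "end"])
bed.to_csv("psichomics_raw_data_bedded.txt", sep="\t", index=False)
```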
411 changes: 244 additions & 167 deletions _sources/code/molecular_phenotypes/calling/RNA_calling.ipynb

Large diffs are not rendered by default.

@@ -342,8 +342,8 @@
" res <- list()\n",
" for (i in 1:nrow(meta_df)) {\n",
" line <- meta_df[i,]\n",
" region_file <- line[6]\n",
" sumstats_db_file <- line[7]\n",
" region_file <- line[7]\n",
" sumstats_db_file <- line[8]\n",
" strong_file <- load_multitrait_R_sumstat(readRDS(region_file), readRDS(sumstats_db_file), coverage = \"${coverage}\", top_loci=TRUE, exclude_condition = exclude_condition)\n",
" ran_null_file <- load_multitrait_R_sumstat(readRDS(region_file), readRDS(sumstats_db_file), filter_file=${independent_variant_list:r}, exclude_condition = exclude_condition)\n",
" ran_null <- mash_rand_null_sample(ran_null_file, ${n_random}, ${n_null}, exclude_condition = exclude_condition, seed=${seed})\n",
96 changes: 80 additions & 16 deletions _sources/code/post_processing/fine_mapping_post_processing.ipynb
@@ -251,6 +251,7 @@
},
"outputs": [],
"source": [
"#susie\n",
"sos run xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n",
" --region_file ~/codes/fungen-xqtl-analysis/resource/TADB_enhanced_cis.protein_coding.bed \\\n",
" --file_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024_new/rds_files/ \\\n",
@@ -262,6 +263,28 @@
" --min_corr 0.8 \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f99ee4f4-b614-467c-8336-4471c749a975",
"metadata": {
"kernel": "SoS"
},
"outputs": [],
"source": [
"#fsusie\n",
"sos run xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n",
" --region_file /mnt/vast/hpc/homes/rf2872/codes/fungen-xqtl-analysis/resource/TADB_region.bed \\\n",
" --file_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024_new/rds_files/ \\\n",
" --name Fungen_xQTL \\\n",
" --cwd output \\\n",
" --suffix fsusie_mixture_normal_top_pc_weights.rds \\\n",
" --prefix ROSMAP_haQTL \\\n",
" --data_type qtl \\\n",
" --min_corr 0.8 \\\n",
" --fsusie True"
]
},
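The cell above is new in this commit: it reruns `cis_results_export` with `--fsusie True` so that fSuSiE results, which keep their preset results in a different layer of the RDS object (see the workflow changes further down), are exported correctly. A hypothetical sketch of how the per-region input files for such a run could be enumerated, assuming files are matched by the `--prefix`/`--suffix` naming shown above (directory and pattern are illustrative):

```python
# Hypothetical sketch: list the per-region RDS files an export run like the one above would
# consume, assuming a prefix*suffix naming convention. Paths and pattern are illustrative.
from pathlib import Path

file_path = Path("rds_files")
prefix = "ROSMAP_haQTL"
suffix = "fsusie_mixture_normal_top_pc_weights.rds"
rds_files = sorted(file_path.glob(f"{prefix}*{suffix}"))
print(f"{len(rds_files)} regions to export")
```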
{
"cell_type": "markdown",
"id": "65f3b184-552a-4ca0-b11d-0e56516ed6ff",
@@ -313,10 +336,10 @@
"outputs": [],
"source": [
"sos run ~/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb overlap_qtl_gwas \\\n",
" --qtl-files `ls /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024/data/*/*export.rds ` \\\n",
" --gwas-union-file /mnt/vast/hpc/csg/rf2872/Work/Multivariate/gwas/output/ADGWAS.union_export.tsv.gz \\\n",
" --name ROSMAP_eQTL \\\n",
" -c ~/env_files/csg.yml -J 50"
" --qtl_meta_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/gwas/output/Fungen_xQTL.cis_results_db.tsv \\\n",
" --gwas_meta_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/gwas/ADGWAS_Mar7/ADGWAS_Feb22.block_results_db.tsv \\\n",
" --cwd overlap_test"
]
},
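The overlap step now takes the exported meta tables directly via `--qtl_meta_path` and `--gwas_meta_path` rather than explicit per-region file lists. A quick way to inspect such a meta table before running the overlap, using column names that appear in the workflow code further down (the path is illustrative):

```python
# Peek at an exported *_results_db.tsv meta table. Column names (region_id, combined_data,
# conditions_top_loci) come from the workflow code below; the path is illustrative.
import pandas as pd

qtl_meta = pd.read_csv("Fungen_xQTL.cis_results_db.tsv", sep="\t", low_memory=False)
with_loci = qtl_meta[qtl_meta["conditions_top_loci"].notna()]
print(with_loci[["region_id", "combined_data", "conditions_top_loci"]].head())
```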
{
@@ -1390,6 +1413,7 @@
"# the analysis would be focused on the union of provides region list and region names\n",
"parameter: region_name = []\n",
"parameter: min_corr = 0.5\n",
"parameter: fsusie = False\n",
"import pandas as pd\n",
"import os\n",
"\n",
@@ -1535,7 +1559,30 @@
" names(pip) = susie_obj$variant_names\n",
" return(pip)\n",
" }\n",
" \n",
" # add column in top loci to indicate if the variant is identified in susie_on_top_pc top loci table or not\n",
" annotate_susie <- function(obj, top_loci){\n",
" df <- data.frame()\n",
"\n",
" results <- lapply(obj[['susie_on_top_pc']], function(x) {\n",
" x[['top_loci']]\n",
" })\n",
"\n",
" df <- bind_rows(results)\n",
"\n",
" top_loci_others <- df %>%\n",
" filter(rowSums(select(., starts_with('cs_coverage_'))) == 0)\n",
" top_loci_cs <- df %>%\n",
" filter(rowSums(select(., starts_with('cs_coverage_'))) > 0) \n",
"\n",
" susie_vars <- rbind(top_loci_others %>% filter(pip >= 0.05), top_loci_cs) %>% pull(variant_id)\n",
" susie_vars_cs <- top_loci_cs %>% pull(variant_id)\n",
" top_loci <- top_loci %>% mutate(annotated_susie = ifelse(variant_id %in% susie_vars, 1, 0),\n",
" annotated_susie_cs = ifelse(variant_id %in% susie_vars_cs, 1, 0))\n",
"\n",
" return(top_loci)\n",
" }\n",
" \n",
" # Process each path and collect results\n",
" orig_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input])})\n",
" # Extract info from each RDS file\n",
@@ -1568,8 +1615,9 @@
" if(length(conditions) > 0) {\n",
" \n",
" for(condition in conditions) {\n",
" dat_con <- dat_study[[condition]]\n",
" dat_susie <- dat_con$susie_result_trimmed\n",
" dat_con <- dat_study[[condition]]${[['fsusie_summary']] if fsusie else ''}\n",
" dat_susie <- dat_con$susie_result_trimme\n",
"\n",
" \n",
" if(${\"TRUE\" if condition_meta.is_file() else \"FALSE\"}){\n",
" meta <- suppressMessages(read_delim(\"${condition_meta}\", col_names = F))\n",
@@ -1583,11 +1631,20 @@
" res[[gene]][[s]][[qtl_con]] <- list(\n",
" region_info = dat_con$region_info,\n",
" top_loci = process_top_loci(dat_susie, dat_con$top_loci),\n",
" preset_top_loci = process_top_loci(dat_susie, dat_con$preset_variants_result$top_loci),\n",
" pip = get_pip_withname(dat_con),\n",
" preset_pip = get_pip_withname(dat_con$preset_variants_result),\n",
" CV_table = dat_con$twas_cv_result$performance\n",
" )\n",
" if ('${data_type}' == 'qtl') { \n",
" # fsusie saved preset results in different layer\n",
" if (${\"TRUE\" if fsusie else \"FALSE\"}){\n",
" res[[gene]][[s]][[qtl_con]][['top_loci']] = annotate_susie(dat_study[[condition]], res[[gene]][[s]][[qtl_con]][['top_loci']])\n",
" res[[gene]][[s]][[qtl_con]][['preset_top_loci']] = process_top_loci(dat_susie, dat_study[[condition]]$preset_variants_result$top_loci) %>% annotate_susie(dat_study[[condition]], .)\n",
" res[[gene]][[s]][[qtl_con]][['preset_pip']] = get_pip_withname(dat_study[[condition]]$preset_variants_result)\n",
" } else {\n",
" res[[gene]][[s]][[qtl_con]][['preset_top_loci']] = process_top_loci(dat_susie, dat_con$preset_variants_result$top_loci)\n",
" res[[gene]][[s]][[qtl_con]][['preset_pip']] = get_pip_withname(dat_con$preset_variants_result)\n",
" }\n",
" }\n",
" \n",
" res_sum[[gene]][[s]][[qtl_con]] <- list(\n",
" variant_names = dat_con$variant_names,\n",
@@ -1618,10 +1675,12 @@
" saveRDS(res, combine_data)\n",
" saveRDS(res_sum, combine_data_sumstats)\n",
" TSS <- tryCatch({dat_con$region_info$region_coord$start}, error = function(e) {return(NA)})\n",
" \n",
" if (${\"TRUE\" if fsusie else \"FALSE\"}) conditions = paste(names(res[[1]]), collapse = \",\") else conditions = paste(names(res[[gene]]), collapse = \",\")\n",
" \n",
" meta = data.frame(chr=\"${_meta_info[0]}\", start=\"${_meta_info[1]}\", end=\"${_meta_info[2]}\", gene_id=\"${_meta_info[3]}\", TSS = if(is.null(TSS)) NA else TSS, \n",
" original_data = paste(basename(orig_files), collapse = \", \"), combined_data = basename(combine_data), combined_data_sumstats = basename(combine_data_sumstats), \n",
" conditions = paste(names(res[[gene]]), collapse = \",\"), \n",
" meta = data.frame(chr=\"${_meta_info[0]}\", start=\"${_meta_info[1]}\", end=\"${_meta_info[2]}\", region_id=\"${_meta_info[3]}\", TSS = if(is.null(TSS)) NA else TSS, \n",
" original_data = paste(basename(orig_files), collapse = \",\"), combined_data = basename(combine_data), combined_data_sumstats = basename(combine_data_sumstats), \n",
" conditions = conditions, \n",
" conditions_top_loci = if(length(cons_top_loci) > 0) cons_top_loci[[1]] %>% unlist %>% names %>% as.character %>% paste(., collapse = ',') else '')\n",
" write_delim(meta, \"${_output}\", delim = '\\t')\n",
" "
@@ -1668,7 +1727,7 @@
" if (${\"TRUE\" if exported_file.is_file() else \"FALSE\"}){\n",
" exp_meta <- read_delim(${_output:r}, delim = '\\t')\n",
" meta <- bind_rows(meta, exp_meta) %>%\n",
" group_by(gene_id) %>%\n",
" group_by(region_id) %>%\n",
" summarise(across(c(original_data, combined_data, combined_data_sumstats, conditions, conditions_top_loci), \n",
" ~paste(unique(.), collapse = \",\")),\n",
" .groups = 'drop')\n",
@@ -1761,9 +1820,9 @@
"[overlap_qtl_gwas_1]\n",
"parameter: per_chunk = 100\n",
"parameter: gwas_meta_path = path()\n",
"parameter: qtl_file_path = ''\n",
"parameter: gwas_file_path = ''\n",
"parameter: qtl_meta_path = path()\n",
"parameter: gwas_file_path = ''\n",
"parameter: qtl_file_path = ''\n",
"# Optional: if a region list is provide the analysis will be focused on provided region. \n",
"# The LAST column of this list will contain the ID of regions to focus on\n",
"# Otherwise, all regions with both genotype and phenotype files will be analyzed\n",
@@ -1775,6 +1834,11 @@
"import numpy as np\n",
"from pathlib import Path\n",
"\n",
"if qtl_file_path == '':\n",
" qtl_file_path = qtl_meta_path.parent\n",
"if gwas_file_path == '':\n",
" gwas_file_path = gwas_meta_path.parent\n",
" \n",
"# Load the data, suppressing messages is not typically done in pandas as it does not inherently output messages when loading files\n",
"gwas_meta = pd.read_csv(gwas_meta_path, sep='\\t', low_memory=False)\n",
"gwas_meta = gwas_meta[gwas_meta['conditions_top_loci'].notna()]\n",
@@ -1798,7 +1862,7 @@
" region_ids = list(set(region_ids).union(set(region_name)))\n",
" \n",
"if len(region_ids) > 0:\n",
" qtl_meta = qtl_meta[qtl_meta['gene_id'].isin(region_ids)]\n",
" qtl_meta = qtl_meta[qtl_meta['region_id'].isin(region_ids)]\n",
" \n",
"def group_by_region(lst, partition):\n",
" # from itertools import accumulate\n",
@@ -1835,7 +1899,7 @@
"\n",
"\n",
"regional_data = {\n",
" 'meta': [(row['#chr'], row['start'], row['end'], row['gene_id'], row['gwas_file'].split(',')) for _, row in qtl_meta_filtered.iterrows()],\n",
" 'meta': [(row['#chr'], row['start'], row['end'], row['region_id'], row['gwas_file'].split(',')) for _, row in qtl_meta_filtered.iterrows()],\n",
" 'qtl_data': [f\"{qtl_file_path}/{row['combined_data']}\" for _, row in qtl_meta_filtered.iterrows()]\n",
"}\n",
"\n",
@@ -1981,7 +2045,7 @@
" }\n",
" \n",
" qtl_meta <- suppressMessages(read_delim('${qtl_meta_path}'))\n",
" qtl_meta <- qtl_meta %>% filter(gene_id == '${_meta_info[3]}') %>% mutate(block_top_loci = block_top_loci,\n",
" qtl_meta <- qtl_meta %>% filter(region_id == '${_meta_info[3]}') %>% mutate(block_top_loci = block_top_loci,\n",
" final_combined_data = final_combined_data)\n",
" fwrite(qtl_meta, ${_output:r})"
]