Skip to content

Commit

Permalink
Update documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
gaow committed Mar 11, 2024
1 parent a211e7a commit 576f8c0
Show file tree
Hide file tree
Showing 19 changed files with 1,568 additions and 1,728 deletions.
8 changes: 4 additions & 4 deletions _sources/code/association_scan/TensorQTL/TensorQTL.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,7 @@
" cis_list = pd.read_csv(\"$[customized_cis_windows:a]\", comment=\"#\", header=None, names=[\"chr\",\"start\",\"end\",phenotype_id], sep=\"\\t\")\n",
" phenotype_pos_df_reset = phenotype_pos_df.reset_index() #move the phenotype id index to a new column of the dataframe\n",
" phenotype_pos_df = phenotype_pos_df_reset.merge(cis_list, left_on = [\"chr\",phenotype_id],right_on = [cis_list.columns[0],cis_list.columns[3]])#in some cases (gene expression for eQTLs) the phenotype_id may be in the cis_list file\n",
" if len(phenotype_df.index) - phenotype_pos_df_reset[~phenotype_pos_df_reset['ID'].isin(cis_list['ID'])].shape[0]!= len(phenotype_pos_df.index):\n",
" if len(phenotype_df.index) - phenotype_pos_df_reset[~phenotype_pos_df_reset[phenotype_id].isin(cis_list[phenotype_id])].shape[0]!= len(phenotype_pos_df.index):\n",
" raise ValueError(\"cannot uniquely match all the phentoype data in the input to the customized cis windows provided\")\n",
" phenotype_pos_df = phenotype_pos_df.set_index(phenotype_id)[[\"chr\",\"start\",\"end\"]] # The final phenotype_pos_df will have three columns(chr, start, end) and index is the phenotype ID\n",
" window = 0 # In the updated tensorQTL, by default if there is a customized cis window, the actual cis window will be start - window & end + window, so it is necessary to change the window parameter to 0\n",
Expand All @@ -747,8 +747,8 @@
" variant_df = pr.bim.set_index('snp')[['chrom', 'pos','a0','a1']]\n",
"\n",
" ## Retaining only traits in cis_list\n",
" if phenotype_pos_df_reset[~phenotype_pos_df_reset['ID'].isin(cis_list['ID'])].shape[0] != 0:\n",
" phenotype_df = phenotype_df.loc[phenotype_df.index.isin(cis_list['ID'])]\n",
" if phenotype_pos_df_reset[~phenotype_pos_df_reset[phenotype_id].isin(cis_list[phenotype_id])].shape[0] != 0:\n",
" phenotype_df = phenotype_df.loc[phenotype_df.index.isin(cis_list[phenotype_id])]\n",
"\n",
" ## Retaining only common samples\n",
" phenotype_df = phenotype_df[np.intersect1d(phenotype_df.columns, covariates_df.index)]\n",
Expand Down Expand Up @@ -789,7 +789,7 @@
" trans_df.columns.values[4] = \"se\"\n",
" trans_df = trans_df.merge(variant_df,left_on = \"variant_id\", right_index=True)\n",
" trans_df.to_csv(\"$[_output]\", sep='\\t',index = None, compression={'method': 'gzip', 'compresslevel': 9})\n",
" covariates_df.index.to_series().to_csv(\"$[_output:bnn].sample_analyzed.txt\", sep='\\t', index=None, header=False)\n",
"\n",
"\n",
"bash: expand= \"$[ ]\", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, entrypoint = entrypoint\n",
" for i in $[_output] ; do \n",
Expand Down
8 changes: 4 additions & 4 deletions _sources/code/data_preprocessing/genotype/VCF_QC.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -687,10 +687,10 @@
"output: f\"{_input:nn}.bcftools_qc.vcf.gz\"\n",
"task: trunk_workers = 1, trunk_size=job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
"bash: container = container, expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', entrypoint=entrypoint\n",
" # Initial filtering based on depth and genotype quality for SNPs and INDELs\n",
" bcftools filter -S . -e \\\n",
" '(TYPE=\"SNP\" & (FMT/DP)<${DP_snp} & (FMT/GQ)<${GQ}) | \n",
" (TYPE=\"INDEL\" & (FMT/DP)<${DP_indel} & (FMT/GQ)<${GQ})' ${_input} | \\\n",
" # Initial filtering based on depth and genotype quality for SNPs and INDELs. Will set to missing genotypes that do not meet both conditions (DP>=DP_spns & GQ>=GQ), similarly for indels\n",
" bcftools filter -S . -i \\\n",
" '(TYPE=\"SNP\" & (FMT/DP)>=${DP_snp} & (FMT/GQ)>=${GQ}) | \n",
" (TYPE=\"INDEL\" & (FMT/DP)>=${DP_indel} & (FMT/GQ)>=${GQ})' ${_input} | \\\n",
" \n",
" # Further filtering to retain only variants that are PASS and have at least one non-reference allele \n",
" bcftools view -c1 | \\\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,7 @@
"parameter: overlap_ratio = 0.8\n",
"input: intron_count, annotation_gtf\n",
"output: f'{cwd}/{_input[1]:b}.exon_list', f'{cwd}/{_input[0]:b}.leafcutter.clusters_to_genes.txt'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output[0]:bn}' \n",
"python: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container, entrypoint=entrypoint\n",
" import pandas as pd\n",
" import qtl.annotation\n",
Expand Down Expand Up @@ -1040,7 +1041,7 @@
" }\n",
"\n",
" cat(\"LeafCutter: mapping clusters to genes\\n\")\n",
" intron_counts <- read.table(${_input[0]:r}, header=TRUE, check.names=FALSE, row.names=1)\n",
" intron_counts <- read.table(${_input[0]:r}, header=TRUE, check.names=FALSE, row.names=4)\n",
" intron_meta <- get_intron_meta(rownames(intron_counts))\n",
" exon_table <- read.table(${_output[0]:r}, header=TRUE, stringsAsFactors=FALSE)\n",
" if(!str_detect(intron_meta$chr[1],\"chr\")) {\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@
"parameter: TAD_list = path\n",
"parameter: phenotype_per_tad = 2 # This is the minimum number of epigenomics marker for a tadb to be considered having a functions.\n",
"input: phenoFile,TAD_list\n",
"output: f'{cwd}/{_input[0]}.{_input[1]}.{phenotype_per_tad}_pheno_per_region.region_list'\n",
"output: f'{cwd}/{_input[0]:b}.{_input[1]:b}.{phenotype_per_tad}_pheno_per_region.region_list'\n",
"R: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout',container = container, entrypoint=entrypoint\n",
" library(tidyverse)\n",
"\n",
Expand Down Expand Up @@ -579,7 +579,7 @@
""
]
],
"version": "0.24.3"
"version": "0.24.1"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@
"parameter: num_factor = '60'\n",
"input: phenoFile\n",
"output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}' \n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}', cores = numThreads \n",
"R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n",
" library(\"flashier\")\n",
" library(\"tibble\")\n",
Expand All @@ -149,7 +149,7 @@
" ###uses \"svd\" algorithm\n",
" fit1 <- softImpute::softImpute(X_mis_C,rank = rank,lambda = lambda,type = \"svd\")\n",
" pheno_soft <- softImpute::complete(as.matrix(pheno_NAs),fit1)\n",
" min_sd <- min(apply(pheno_soft, 1, sd)) / 100\n",
" min_sd <- min(apply(pheno_soft, 1, sd))\n",
" pca_res <- irlba::irlba(pheno_soft, nv = ${num_factor})\n",
" pca_res <- list(pca_res$d, pca_res$u, pca_res$v)\n",
" names(pca_res) <- c('d', 'u', 'v')\n",
Expand Down Expand Up @@ -192,7 +192,7 @@
"parameter: null_check = False\n",
"input: phenoFile\n",
"output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}' \n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}', cores = numThreads \n",
"R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n",
" library(\"flashier\")\n",
" library(\"tibble\")\n",
Expand Down
38 changes: 38 additions & 0 deletions _sources/code/mnm_analysis/fSuSiE.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,44 @@
"## Generate phenotype data per TADB list\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7521db28-4613-4e4a-9fd5-eefe314af311",
"metadata": {},
"outputs": [],
"source": [
"/home/hs3163/GIT/fungen-xqtl-analysis/analysis/Wang_Columbia/susie_twas/ROSMAP_eQTL/TADB_enhanced_cis.coding_nochrX.bed"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0695c9e-504b-453f-8713-3dcc1ef24072",
"metadata": {},
"outputs": [],
"source": [
"sos run pipeline/phenotype_formatting.ipynb phenotype_annotate_by_tad \\\n",
" --TAD_list /home/hs3163/GIT/fungen-xqtl-analysis/analysis/Wang_Columbia/susie_twas/ROSMAP_eQTL/TADB_enhanced_cis.coding_nochrX.bed \\\n",
" --phenoFile \\\n",
" --phenotype_per_tad 2 \\\n",
" --container oras://ghcr.io/cumc/pecotmr_apptainer:latest"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73a4d65a-fdca-47a2-a0af-680439b15007",
"metadata": {},
"outputs": [],
"source": [
"sos run pipeline/phenotype_formatting.ipynb phenotype_annotate_by_tad \\\n",
" --TAD_list ~/GIT/fungen-xqtl-analysis/resource/extended_TADB.bed \\\n",
" --phenoFile /mnt/vast/hpc/csg/molecular_phenotype_calling/QTL_fine_mapping/h3k9ac_whole.bed.gz \\\n",
" --phenotype_per_tad 2 \\\n",
" --container oras://ghcr.io/cumc/pecotmr_apptainer:latest"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Loading

0 comments on commit 576f8c0

Please sign in to comment.