Update documentation

cumc · Mar 11, 2024 · 576f8c0 · 576f8c0
1 parent a211e7a
commit 576f8c0
Show file tree

Hide file tree

Showing 19 changed files with 1,568 additions and 1,728 deletions.
diff --git a/_sources/code/association_scan/TensorQTL/TensorQTL.ipynb b/_sources/code/association_scan/TensorQTL/TensorQTL.ipynb
@@ -732,7 +732,7 @@
     "        cis_list = pd.read_csv(\"$[customized_cis_windows:a]\", comment=\"#\", header=None, names=[\"chr\",\"start\",\"end\",phenotype_id], sep=\"\\t\")\n",
     "        phenotype_pos_df_reset = phenotype_pos_df.reset_index() #move the phenotype id index to a new column of the dataframe\n",
     "        phenotype_pos_df = phenotype_pos_df_reset.merge(cis_list, left_on = [\"chr\",phenotype_id],right_on = [cis_list.columns[0],cis_list.columns[3]])#in some cases (gene expression for eQTLs) the phenotype_id may be in the cis_list file\n",
-    "        if len(phenotype_df.index) - phenotype_pos_df_reset[~phenotype_pos_df_reset['ID'].isin(cis_list['ID'])].shape[0]!= len(phenotype_pos_df.index):\n",
+    "        if len(phenotype_df.index) - phenotype_pos_df_reset[~phenotype_pos_df_reset[phenotype_id].isin(cis_list[phenotype_id])].shape[0]!= len(phenotype_pos_df.index):\n",
     "            raise ValueError(\"cannot uniquely match all the phentoype data in the input to the customized cis windows provided\")\n",
     "        phenotype_pos_df = phenotype_pos_df.set_index(phenotype_id)[[\"chr\",\"start\",\"end\"]] # The final phenotype_pos_df will have three columns(chr, start, end) and index is the phenotype ID\n",
     "        window = 0 # In the updated tensorQTL, by default if there is a customized cis window, the actual cis window will be start - window & end + window, so it is necessary to change the window parameter to 0\n",
@@ -747,8 +747,8 @@
     "    variant_df = pr.bim.set_index('snp')[['chrom', 'pos','a0','a1']]\n",
     "\n",
     "    ## Retaining only traits in cis_list\n",
-    "    if phenotype_pos_df_reset[~phenotype_pos_df_reset['ID'].isin(cis_list['ID'])].shape[0] != 0:\n",
-    "        phenotype_df = phenotype_df.loc[phenotype_df.index.isin(cis_list['ID'])]\n",
+    "    if phenotype_pos_df_reset[~phenotype_pos_df_reset[phenotype_id].isin(cis_list[phenotype_id])].shape[0] != 0:\n",
+    "        phenotype_df = phenotype_df.loc[phenotype_df.index.isin(cis_list[phenotype_id])]\n",
     "\n",
     "    ## Retaining only common samples\n",
     "    phenotype_df = phenotype_df[np.intersect1d(phenotype_df.columns, covariates_df.index)]\n",
@@ -789,7 +789,7 @@
     "    trans_df.columns.values[4]  = \"se\"\n",
     "    trans_df = trans_df.merge(variant_df,left_on = \"variant_id\", right_index=True)\n",
     "    trans_df.to_csv(\"$[_output]\", sep='\\t',index = None, compression={'method': 'gzip', 'compresslevel': 9})\n",
-    "    covariates_df.index.to_series().to_csv(\"$[_output:bnn].sample_analyzed.txt\", sep='\\t', index=None, header=False)\n",
+    "\n",
     "\n",
     "bash: expand= \"$[ ]\", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, entrypoint = entrypoint\n",
     "        for i in $[_output] ; do \n",

diff --git a/_sources/code/data_preprocessing/genotype/VCF_QC.ipynb b/_sources/code/data_preprocessing/genotype/VCF_QC.ipynb
@@ -687,10 +687,10 @@
     "output: f\"{_input:nn}.bcftools_qc.vcf.gz\"\n",
     "task: trunk_workers = 1, trunk_size=job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
     "bash: container = container, expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', entrypoint=entrypoint\n",
-    "    # Initial filtering based on depth and genotype quality for SNPs and INDELs\n",
-    "    bcftools filter -S . -e \\\n",
-    "    '(TYPE=\"SNP\" & (FMT/DP)<${DP_snp} & (FMT/GQ)<${GQ}) | \n",
-    "     (TYPE=\"INDEL\" & (FMT/DP)<${DP_indel} & (FMT/GQ)<${GQ})' ${_input} | \\\n",
+    "    # Initial filtering based on depth and genotype quality for SNPs and INDELs. Will set to missing genotypes that do not meet both conditions (DP>=DP_spns & GQ>=GQ), similarly for indels\n",
+    "    bcftools filter -S . -i \\\n",
+    "    '(TYPE=\"SNP\" & (FMT/DP)>=${DP_snp} & (FMT/GQ)>=${GQ}) | \n",
+    "     (TYPE=\"INDEL\" & (FMT/DP)>=${DP_indel} & (FMT/GQ)>=${GQ})' ${_input} | \\\n",
     "     \n",
     "    # Further filtering to retain only variants that are PASS and have at least one non-reference allele \n",
     "    bcftools view -c1 | \\\n",

diff --git a/_sources/code/data_preprocessing/phenotype/gene_annotation.ipynb b/_sources/code/data_preprocessing/phenotype/gene_annotation.ipynb
@@ -952,6 +952,7 @@
     "parameter: overlap_ratio = 0.8\n",
     "input: intron_count, annotation_gtf\n",
     "output: f'{cwd}/{_input[1]:b}.exon_list', f'{cwd}/{_input[0]:b}.leafcutter.clusters_to_genes.txt'\n",
+    "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output[0]:bn}'  \n",
     "python: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container, entrypoint=entrypoint\n",
     "    import pandas as pd\n",
     "    import qtl.annotation\n",
@@ -1040,7 +1041,7 @@
     "    }\n",
     "\n",
     "    cat(\"LeafCutter: mapping clusters to genes\\n\")\n",
-    "    intron_counts <- read.table(${_input[0]:r}, header=TRUE, check.names=FALSE, row.names=1)\n",
+    "    intron_counts <- read.table(${_input[0]:r}, header=TRUE, check.names=FALSE, row.names=4)\n",
     "    intron_meta <- get_intron_meta(rownames(intron_counts))\n",
     "    exon_table <- read.table(${_output[0]:r}, header=TRUE, stringsAsFactors=FALSE)\n",
     "    if(!str_detect(intron_meta$chr[1],\"chr\")) {\n",

diff --git a/_sources/code/data_preprocessing/phenotype/phenotype_formatting.ipynb b/_sources/code/data_preprocessing/phenotype/phenotype_formatting.ipynb
@@ -375,7 +375,7 @@
     "parameter: TAD_list = path\n",
     "parameter: phenotype_per_tad = 2 # This is the minimum number of epigenomics marker for a tadb to be considered having a functions.\n",
     "input: phenoFile,TAD_list\n",
-    "output: f'{cwd}/{_input[0]}.{_input[1]}.{phenotype_per_tad}_pheno_per_region.region_list'\n",
+    "output: f'{cwd}/{_input[0]:b}.{_input[1]:b}.{phenotype_per_tad}_pheno_per_region.region_list'\n",
     "R: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout',container = container, entrypoint=entrypoint\n",
     "    library(tidyverse)\n",
     "\n",
@@ -579,7 +579,7 @@
      ""
     ]
    ],
-   "version": "0.24.3"
+   "version": "0.24.1"
   }
  },
  "nbformat": 4,

diff --git a/_sources/code/data_preprocessing/phenotype/phenotype_imputation.ipynb b/_sources/code/data_preprocessing/phenotype/phenotype_imputation.ipynb
@@ -125,7 +125,7 @@
     "parameter: num_factor = '60'\n",
     "input: phenoFile\n",
     "output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
-    "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}'  \n",
+    "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}', cores = numThreads  \n",
     "R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n",
     "   library(\"flashier\")\n",
     "   library(\"tibble\")\n",
@@ -149,7 +149,7 @@
     "    ###uses \"svd\" algorithm\n",
     "    fit1 <- softImpute::softImpute(X_mis_C,rank = rank,lambda = lambda,type = \"svd\")\n",
     "    pheno_soft <- softImpute::complete(as.matrix(pheno_NAs),fit1)\n",
-    "    min_sd <- min(apply(pheno_soft, 1, sd)) / 100\n",
+    "    min_sd <- min(apply(pheno_soft, 1, sd))\n",
     "    pca_res <- irlba::irlba(pheno_soft, nv = ${num_factor})\n",
     "    pca_res <- list(pca_res$d, pca_res$u, pca_res$v)\n",
     "    names(pca_res) <- c('d', 'u', 'v')\n",
@@ -192,7 +192,7 @@
     "parameter: null_check = False\n",
     "input: phenoFile\n",
     "output: f'{cwd:a}/{_input:bn}.imputed.bed.gz'\n",
-    "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}'  \n",
+    "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}', cores = numThreads  \n",
     "R: expand= \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n",
     "   library(\"flashier\")\n",
     "   library(\"tibble\")\n",

diff --git a/_sources/code/mnm_analysis/fSuSiE.ipynb b/_sources/code/mnm_analysis/fSuSiE.ipynb
@@ -18,6 +18,44 @@
     "## Generate phenotype data per TADB list\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7521db28-4613-4e4a-9fd5-eefe314af311",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "/home/hs3163/GIT/fungen-xqtl-analysis/analysis/Wang_Columbia/susie_twas/ROSMAP_eQTL/TADB_enhanced_cis.coding_nochrX.bed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0695c9e-504b-453f-8713-3dcc1ef24072",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sos run pipeline/phenotype_formatting.ipynb phenotype_annotate_by_tad \\\n",
+    "    --TAD_list /home/hs3163/GIT/fungen-xqtl-analysis/analysis/Wang_Columbia/susie_twas/ROSMAP_eQTL/TADB_enhanced_cis.coding_nochrX.bed  \\\n",
+    "    --phenoFile  \\\n",
+    "    --phenotype_per_tad 2 \\\n",
+    "    --container oras://ghcr.io/cumc/pecotmr_apptainer:latest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "73a4d65a-fdca-47a2-a0af-680439b15007",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sos run pipeline/phenotype_formatting.ipynb phenotype_annotate_by_tad \\\n",
+    "    --TAD_list ~/GIT/fungen-xqtl-analysis/resource/extended_TADB.bed  \\\n",
+    "    --phenoFile /mnt/vast/hpc/csg/molecular_phenotype_calling/QTL_fine_mapping/h3k9ac_whole.bed.gz \\\n",
+    "    --phenotype_per_tad 2 \\\n",
+    "    --container oras://ghcr.io/cumc/pecotmr_apptainer:latest"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,