Update documentation

cumc · Mar 23, 2024 · 5fee4ae · 5fee4ae
1 parent 834bf72
commit 5fee4ae
Show file tree

Hide file tree

Showing 24 changed files with 1,850 additions and 1,368 deletions.
diff --git a/README.html b/README.html
@@ -253,7 +253,7 @@
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced Genome-wide Analysis</span></p>
 <ul class="nav bd-sidenav">
-<li class="toctree-l1"><a class="reference internal" href="code/mnm_analysis/rss_analysis.html">Fine-mapping with SuSiE RSS model</a></li>
+<li class="toctree-l1"><a class="reference internal" href="code/mnm_analysis/rss_analysis.html">High-dimensional regression with summary statistics</a></li>
 <li class="toctree-l1"><a class="reference internal" href="code/prototype_drafts/polyfun.html">Fine-mapping with PolyFun</a></li>
 <li class="toctree-l1"><a class="reference internal" href="code/prototype_drafts/MRAID_QTL.html">Mendelian Randomization using MRAID</a></li>
 </ul>

diff --git a/_sources/code/association_scan/TensorQTL/TensorQTL.ipynb b/_sources/code/association_scan/TensorQTL/TensorQTL.ipynb
diff --git a/_sources/code/data_preprocessing/phenotype/gene_annotation.ipynb b/_sources/code/data_preprocessing/phenotype/gene_annotation.ipynb
@@ -951,13 +951,44 @@
     "# Define the overlap ratio as the proportion of the cluster length that intersects with a gene, used to determine mapping to the gene.\n",
     "parameter: overlap_ratio = 0.8\n",
     "input: intron_count, annotation_gtf\n",
-    "output: f'{cwd}/{_input[1]:b}.exon_list', f'{cwd}/{_input[0]:b}.leafcutter.clusters_to_genes.txt'\n",
+    "output: f'{cwd}/{_input[0]:b}.exon_list', f'{cwd}/{_input[0]:b}.leafcutter.clusters_to_genes.txt'\n",
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output[0]:bn}'  \n",
     "python: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container, entrypoint=entrypoint\n",
     "    import pandas as pd\n",
     "    import qtl.annotation\n",
+    "\n",
+    "    gtf = ${_input[1]:r}\n",
+    "    #there is no \"gene_type\" when using the uncollapsed gtf. Replace \"gene_biotype\" with \"gene_type\" in the gtf and temporarily save for use in the annotation\n",
+    "    has_gene_type = True\n",
+    "    with open(gtf, 'r') as file:\n",
+    "        for line in file:\n",
+    "            if line[0] == '#':\n",
+    "                continue\n",
+    "\n",
+    "            row = line.strip().split('\\t')\n",
+    "            chrom = row[0]\n",
+    "            # source = row[1]\n",
+    "            annot_type = row[2]\n",
+    "\n",
+    "            if annot_type == \"gene\":\n",
+    "                if \"gene_type\" not in line:\n",
+    "                    has_gene_type = False\n",
+    "                break\n",
+    "\n",
+    "    if has_gene_type == False:\n",
+    "        \n",
+    "        with open(gtf, 'r') as file:\n",
+    "            file_contents = file.read()\n",
+    "        updated_contents = file_contents.replace(\"gene_biotype\", \"gene_type\")\n",
+    "        gtf = ${_input[1]:rn}.tmp${_input[1]:rx}\n",
+    "        print(gtf)\n",
+    "        with open(gtf, 'w') as file:\n",
+    "            file.write(updated_contents)\n",
+    "\n",
+    "\n",
     "    # Load data\n",
-    "    annot = qtl.annotation.Annotation(${_input[1]:r})\n",
+    "    #annot = qtl.annotation.Annotation(${_input[1]:r})\n",
+    "    annot = qtl.annotation.Annotation(gtf)\n",
     "    exon_df = pd.DataFrame([[g.chr, e.start_pos, e.end_pos, g.strand, g.id, g.name]\n",
     "                        for g in annot.genes for e in g.transcripts[0].exons],\n",
     "                       columns=['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name'])\n",
@@ -1092,10 +1123,16 @@
     "    import pandas as pd\n",
     "    import numpy as np\n",
     "    import qtl.io\n",
+    "    import os\n",
     "    from pathlib import Path\n",
     "\n",
     "    # Load data\n",
-    "    tss_df = qtl.io.gtf_to_tss_bed(${_input[1]:r})\n",
+    "    if os.path.exists(${_input[1]:rn}.tmp${_input[1]:rx}):\n",
+    "        gtf = ${_input[1]:rn}.tmp${_input[1]:rx}\n",
+    "    else:\n",
+    "        gtf = ${_input[1]:r}\n",
+    "    #tss_df = qtl.io.gtf_to_tss_bed(${_input[1]:r})\n",
+    "    tss_df = qtl.io.gtf_to_tss_bed(gtf)\n",
     "    bed_df = pd.read_csv(${_input[0]:ar}, sep='\\t', skiprows=0)\n",
     "    bed_df.columns.values[0] = \"#chr\" # Temporary\n",
     "    sample_participant_lookup = Path(\"${sample_participant_lookup:a}\")\n",
@@ -1149,6 +1186,9 @@
     "\n",
     "bash: expand= \"$[ ]\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container, entrypoint=entrypoint\n",
     "        stdout=$[_output[0]:n].stdout\n",
+    "\n",
+    "        rm $[_input[1]:rn].tmp$[_input[1]:rx]\n",
+    "\n",
     "        for i in $[_output[0]] ; do \n",
     "        echo \"output_info: $i \" >> $stdout;\n",
     "        echo \"output_size:\" `ls -lh $i | cut -f 5  -d  \" \"`   >> $stdout;\n",
@@ -1207,7 +1247,15 @@
     "    # change sample IDs to participant IDs\n",
     "    if sample_participant_lookup.is_file():\n",
     "        sample_participant_lookup_s = pd.read_csv(sample_participant_lookup, sep=\"\\t\", index_col=0, dtype={0:str,1:str})\n",
-    "        output.rename(columns=sample_participant_lookup_s.to_dict(), inplace=True)\n",
+    "\n",
+    "        column_mapping = dict(zip(sample_participant_lookup_s.index, sample_participant_lookup_s['participant_id']))\n",
+    "\n",
+    "        column_names = output.columns[4:]\n",
+    "        print(column_names)\n",
+    "        new_column_names = [column_mapping.get(col, col) for col in column_names]\n",
+    "        output.rename(columns=dict(zip(column_names, new_column_names)), inplace=True)\n",
+    "\n",
+    "        #output.rename(columns=sample_participant_lookup_s.to_dict(), inplace=True)\n",
     "\n",
     "    # Old code grouping by each gene\n",
     "    #bed_output = output.drop(\"gene_id\" , axis = 1)\n",