Skip to content

Commit

Permalink
Update documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
gaow committed Mar 23, 2024
1 parent 834bf72 commit 5fee4ae
Show file tree
Hide file tree
Showing 24 changed files with 1,850 additions and 1,368 deletions.
2 changes: 1 addition & 1 deletion README.html
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced Genome-wide Analysis</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="code/mnm_analysis/rss_analysis.html">Fine-mapping with SuSiE RSS model</a></li>
<li class="toctree-l1"><a class="reference internal" href="code/mnm_analysis/rss_analysis.html">High-dimensional regression with summary statistics</a></li>
<li class="toctree-l1"><a class="reference internal" href="code/prototype_drafts/polyfun.html">Fine-mapping with PolyFun</a></li>
<li class="toctree-l1"><a class="reference internal" href="code/prototype_drafts/MRAID_QTL.html">Mendelian Randomization using MRAID</a></li>
</ul>
Expand Down
648 changes: 355 additions & 293 deletions _sources/code/association_scan/TensorQTL/TensorQTL.ipynb

Large diffs are not rendered by default.

56 changes: 52 additions & 4 deletions _sources/code/data_preprocessing/phenotype/gene_annotation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -951,13 +951,44 @@
"# Define the overlap ratio as the proportion of the cluster length that intersects with a gene, used to determine mapping to the gene.\n",
"parameter: overlap_ratio = 0.8\n",
"input: intron_count, annotation_gtf\n",
"output: f'{cwd}/{_input[1]:b}.exon_list', f'{cwd}/{_input[0]:b}.leafcutter.clusters_to_genes.txt'\n",
"output: f'{cwd}/{_input[0]:b}.exon_list', f'{cwd}/{_input[0]:b}.leafcutter.clusters_to_genes.txt'\n",
"task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output[0]:bn}' \n",
"python: expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container, entrypoint=entrypoint\n",
" import pandas as pd\n",
" import qtl.annotation\n",
"\n",
" gtf = ${_input[1]:r}\n",
" # The uncollapsed GTF has no \"gene_type\" attribute. In that case, replace \"gene_biotype\" with \"gene_type\" and save the result as a temporary GTF that is used for the annotation\n",
" has_gene_type = True\n",
" with open(gtf, 'r') as file:\n",
" for line in file:\n",
" if line[0] == '#':\n",
" continue\n",
"\n",
" row = line.strip().split('\\t')\n",
" chrom = row[0]\n",
" # source = row[1]\n",
" annot_type = row[2]\n",
"\n",
" if annot_type == \"gene\":\n",
" if \"gene_type\" not in line:\n",
" has_gene_type = False\n",
" break\n",
"\n",
" if has_gene_type == False:\n",
" \n",
" with open(gtf, 'r') as file:\n",
" file_contents = file.read()\n",
" updated_contents = file_contents.replace(\"gene_biotype\", \"gene_type\")\n",
" gtf = ${_input[1]:rn}.tmp${_input[1]:rx}\n",
" print(gtf)\n",
" with open(gtf, 'w') as file:\n",
" file.write(updated_contents)\n",
"\n",
"\n",
" # Load data\n",
" annot = qtl.annotation.Annotation(${_input[1]:r})\n",
" #annot = qtl.annotation.Annotation(${_input[1]:r})\n",
" annot = qtl.annotation.Annotation(gtf)\n",
" exon_df = pd.DataFrame([[g.chr, e.start_pos, e.end_pos, g.strand, g.id, g.name]\n",
" for g in annot.genes for e in g.transcripts[0].exons],\n",
" columns=['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name'])\n",
Expand Down Expand Up @@ -1092,10 +1123,16 @@
" import pandas as pd\n",
" import numpy as np\n",
" import qtl.io\n",
" import os\n",
" from pathlib import Path\n",
"\n",
" # Load data\n",
" tss_df = qtl.io.gtf_to_tss_bed(${_input[1]:r})\n",
" if os.path.exists(${_input[1]:rn}.tmp${_input[1]:rx}):\n",
" gtf = ${_input[1]:rn}.tmp${_input[1]:rx}\n",
" else:\n",
" gtf = ${_input[1]:r}\n",
" #tss_df = qtl.io.gtf_to_tss_bed(${_input[1]:r})\n",
" tss_df = qtl.io.gtf_to_tss_bed(gtf)\n",
" bed_df = pd.read_csv(${_input[0]:ar}, sep='\\t', skiprows=0)\n",
" bed_df.columns.values[0] = \"#chr\" # Temporary\n",
" sample_participant_lookup = Path(\"${sample_participant_lookup:a}\")\n",
Expand Down Expand Up @@ -1149,6 +1186,9 @@
"\n",
"bash: expand= \"$[ ]\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container, entrypoint=entrypoint\n",
" stdout=$[_output[0]:n].stdout\n",
"\n",
" rm $[_input[1]:rn].tmp$[_input[1]:rx]\n",
"\n",
" for i in $[_output[0]] ; do \n",
" echo \"output_info: $i \" >> $stdout;\n",
" echo \"output_size:\" `ls -lh $i | cut -f 5 -d \" \"` >> $stdout;\n",
Expand Down Expand Up @@ -1207,7 +1247,15 @@
" # change sample IDs to participant IDs\n",
" if sample_participant_lookup.is_file():\n",
" sample_participant_lookup_s = pd.read_csv(sample_participant_lookup, sep=\"\\t\", index_col=0, dtype={0:str,1:str})\n",
" output.rename(columns=sample_participant_lookup_s.to_dict(), inplace=True)\n",
"\n",
" column_mapping = dict(zip(sample_participant_lookup_s.index, sample_participant_lookup_s['participant_id']))\n",
"\n",
" column_names = output.columns[4:]\n",
" print(column_names)\n",
" new_column_names = [column_mapping.get(col, col) for col in column_names]\n",
" output.rename(columns=dict(zip(column_names, new_column_names)), inplace=True)\n",
"\n",
" #output.rename(columns=sample_participant_lookup_s.to_dict(), inplace=True)\n",
"\n",
" # Old code grouping by each gene\n",
" #bed_output = output.drop(\"gene_id\" , axis = 1)\n",
Expand Down
Loading

0 comments on commit 5fee4ae

Please sign in to comment.