From 9534bc8b0e77631b13a7b7062cf5bc1d4336f60d Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Wed, 5 Jul 2023 21:46:27 +0100 Subject: [PATCH 1/5] reads per wlel nb --- docs/ampseeker-results/_toc.yml | 6 +- workflow/Snakefile | 1 + workflow/envs/AmpSeeker-python.yaml | 3 +- workflow/notebooks/reads-per-well.ipynb | 206 ++++++++++++++++++++++++ workflow/rules/common.smk | 4 + workflow/rules/qc.smk | 22 ++- 6 files changed, 239 insertions(+), 3 deletions(-) create mode 100644 workflow/notebooks/reads-per-well.ipynb diff --git a/docs/ampseeker-results/_toc.yml b/docs/ampseeker-results/_toc.yml index 081457d..4322954 100644 --- a/docs/ampseeker-results/_toc.yml +++ b/docs/ampseeker-results/_toc.yml @@ -4,10 +4,14 @@ format: jb-book root: intro parts: +- caption: QC + chapters: + - file: notebooks/read-quality + - file: notebooks/reads-per-well + - file: notebooks/IGV-explore - caption: Results chapters: - file: notebooks/sample-map - - file: notebooks/IGV-explore - file: notebooks/coverage - file: notebooks/principal-component-analysis - file: notebooks/allele-frequencies diff --git a/workflow/Snakefile b/workflow/Snakefile index ac53000..8da637d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -6,6 +6,7 @@ import numpy as np configfile:"config/config.yaml" dataset = config['dataset'] metadata = pd.read_csv(config['metadata'], sep="\t") +plate_info = np.isin(['plate', 'well_letter', 'well_number'], metadata.columns).all() samples = metadata['sampleID'] import os diff --git a/workflow/envs/AmpSeeker-python.yaml b/workflow/envs/AmpSeeker-python.yaml index b2bfd78..c222ef8 100644 --- a/workflow/envs/AmpSeeker-python.yaml +++ b/workflow/envs/AmpSeeker-python.yaml @@ -15,4 +15,5 @@ dependencies: - pip: - igv_notebook - papermill - - scikit-allel \ No newline at end of file + - scikit-allel + - pysam \ No newline at end of file diff --git a/workflow/notebooks/reads-per-well.ipynb b/workflow/notebooks/reads-per-well.ipynb new file mode 100644 index 0000000..0baaa22 --- /dev/null +++ b/workflow/notebooks/reads-per-well.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3757875d", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import plotly.express as px\n", + "import numpy as np\n", + "import pysam\n", + "\n", + "def count_mapped_reads(bam_file):\n", + " mapped_reads = 0\n", + " # Open the BAM file\n", + " with pysam.AlignmentFile(bam_file, \"rb\") as bam:\n", + " # Iterate over alignments\n", + " for alignment in bam:\n", + " # Check if the alignment is mapped\n", + " if not alignment.is_unmapped:\n", + " mapped_reads += 1\n", + "\n", + " return mapped_reads" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "045bf668", + "metadata": { + "tags": [ + "remove-input", + "parameters" + ] + }, + "outputs": [], + "source": [ + "metadata_path = \"../../config/metadata.tsv\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1470a04d", + "metadata": {}, + "source": [ + "# Plate statistics\n", + "\n", + "In this notebook, we explore how sample-level statistics look when we map out the sample by their position on a plate. \n", + "First, lets explore a histogram of the overall reads per sample. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63ad3a24", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "def extract_percentage(string):\n", + " import re\n", + " # Pattern to match a percentage value\n", + " pattern = r'(\\d+(?:\\.\\d+)?)%'\n", + " # Search for the pattern in the string\n", + " match = re.search(pattern, string)\n", + " if match:\n", + " percentage = float(match.group(1))\n", + " return percentage\n", + " return None\n", + "\n", + "df_samples = pd.read_csv(metadata_path, sep=\"\\t\")\n", + "demux_df = pd.read_csv(\"resources/reads/Reports/Demultiplex_Stats.csv\", sep=\",\")\n", + "\n", + "# count mapped reads in bams \n", + "mapped_reads = []\n", + "freq_mapped = []\n", + "for sampleID in df_samples['sampleID']:\n", + " # Call the count_mapped_reads function\n", + " mapped_reads_count = count_mapped_reads(f\"results/alignments/{sampleID}.bam\")\n", + " mapped_reads.append(mapped_reads_count)\n", + " \n", + " df = pd.read_csv(f\"results/alignments/bamStats/{sampleID}.flagstat\")\n", + " freq_mapped.append(extract_percentage(df.iloc[6, 0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bca9e3ca", + "metadata": { + "scrolled": false, + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "df_samples = df_samples.assign(mapped_reads=mapped_reads, freq_mapped=freq_mapped)\n", + "\n", + "fig = px.histogram(df_samples, x='mapped_reads', nbins=60, width=600, height=400)\n", + "fig" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "86d2450a", + "metadata": {}, + "source": [ + "### Mapped reads per well\n", + "\n", + "The below plot displays the samples in their 96 well plate format, showing the number of mapped reads assigned to each sample. Extra data on each sample is available by hovering over the wells. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cda6b689", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "def plot_96well_plate(df_samples, color_var='mapped_reads', title='Plate A - Number of mapped reads'):\n", + " fig = px.scatter(df_samples[::-1], \n", + " y='well_letter', \n", + " x='well_number',\n", + " color=color_var, \n", + " hover_data=df_samples.columns, \n", + " template='plotly_white')\n", + " fig.update_traces(marker_size=40)\n", + " fig.update_layout(xaxis = dict(\n", + " side='top',\n", + " tickmode = 'linear',\n", + " tick0 = 0,\n", + " dtick = 1), \n", + " title=title)\n", + " return fig\n", + "\n", + "for plate in df_samples.plate.unique():\n", + " df = df_samples.query(f\"plate == @plate\")\n", + " fig = plot_96well_plate(df, color_var='mapped_reads', title=f'Plate {plate} - Number of mapped reads')\n", + " fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34ba1bfc", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "for plate in df_samples.plate.unique():\n", + " df = df_samples.query(f\"plate == @plate\")\n", + " fig = plot_96well_plate(df, color_var='freq_mapped', title=f'Plate {plate} - % of reads that align to reference')\n", + " fig.show() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83472e84", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "AmpSeq_python", + "language": "python", + "name": "ampseq_python" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b4a1adb..0afb9bc 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -36,6 +36,10 @@ def AmpSeekerOutputs(wildcards): if config['bcl-convert']: inputs.extend(expand("results/reads/{sample}_{n}.fastq.gz", sample=samples, n=[1,2])) inputs.extend(["results/index-read-qc/I1.html", "results/index-read-qc/I2.html"]) + + if plate_info: + inputs.extend(["results/notebooks/reads-per-well.ipynb", + "docs/ampseeker-results/notebooks/reads-per-well.ipynb"]) if large_sample_size: inputs.extend(expand("results/vcfs/{dataset}.complete.merge_vcfs", dataset=config['dataset'])) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 89cd1c6..39ebc08 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -109,4 +109,24 @@ rule vcfStats: shell: """ bcftools stats {input.vcf} > {output.stats} 2> {log} - """ \ No newline at end of file + """ + +rule reads_per_well: + input: + nb = f"{workflow.basedir}/notebooks/reads-per-well.ipynb", + kernel = "results/.kernel.set", + bam = expand("results/alignments/{sample}.bam", sample=samples), + bai = expand("results/alignments/{sample}.bam.bai", sample=samples), + metadata = config["metadata"], + output: + nb = "results/notebooks/reads-per-well.ipynb", + docs_nb = "docs/ampseeker-results/notebooks/reads-per-well.ipynb" + conda: + "../envs/AmpSeeker-python.yaml" + log: + "logs/notebooks/reads-per-well.log" + shell: + """ + papermill {input.nb} {output.nb} -k AmpSeq_python -p metadata_path {input.metadata} 2> {log} + cp {output.nb} {output.docs_nb} 2>> {log} + """ From 2eae06ecefefd815d3b3a93150ff5883b38066fd Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Thu, 6 Jul 2023 17:18:16 +0100 Subject: [PATCH 2/5] fixes --- workflow/envs/AmpSeeker-qc.yaml | 7 +++-- workflow/notebooks/reads-per-well.ipynb | 1 - workflow/rules/bcl-convert.smk | 37 +++++++------------------ workflow/rules/common.smk | 3 +- workflow/rules/qc.smk | 9 +++--- 5 files changed, 20 insertions(+), 37 deletions(-) diff --git a/workflow/envs/AmpSeeker-qc.yaml b/workflow/envs/AmpSeeker-qc.yaml index 31d2d3c..9cbb402 100644 --- a/workflow/envs/AmpSeeker-qc.yaml +++ b/workflow/envs/AmpSeeker-qc.yaml @@ -1,7 +1,8 @@ name: AmpSeeker-qc channels: - bioconda - - dranew + - conda-forge + - anaconda dependencies: - - fastqc - - bcl2fastq \ No newline at end of file + - python=3.9 + - fastqc \ No newline at end of file diff --git a/workflow/notebooks/reads-per-well.ipynb b/workflow/notebooks/reads-per-well.ipynb index 0baaa22..3dd4d94 100644 --- a/workflow/notebooks/reads-per-well.ipynb +++ b/workflow/notebooks/reads-per-well.ipynb @@ -79,7 +79,6 @@ " return None\n", "\n", "df_samples = pd.read_csv(metadata_path, sep=\"\\t\")\n", - "demux_df = pd.read_csv(\"resources/reads/Reports/Demultiplex_Stats.csv\", sep=\",\")\n", "\n", "# count mapped reads in bams \n", "mapped_reads = []\n", diff --git a/workflow/rules/bcl-convert.smk b/workflow/rules/bcl-convert.smk index 8e18b76..4f3ec64 100644 --- a/workflow/rules/bcl-convert.smk +++ b/workflow/rules/bcl-convert.smk @@ -15,42 +15,25 @@ rule bcl_convert: rule rename_fastq: """ - If users demultiplex from BCL + If users demultiplex from BCL than rename, otherwise symlink to results """ input: reads = "resources/reads/", - fastq_list = "resources/reads/Reports/fastq_list.csv" + read_dir = rules.bcl_convert.output, + fastq_list = "resources/reads/Reports/fastq_list.csv", output: - output_reads = expand("results/reads/{sample}_{n}.fastq.gz", n=[1,2], sample=samples) + output_reads = expand("resources/reads/{sample}_{n}.fastq.gz", n=[1,2], sample=samples) log: "logs/rename_fastq.log" conda: "../envs/AmpSeeker-cli.yaml" - shell: - """ - while IFS="," read -r _ sample_id _ _ read1 read2; do - echo renaming $read1 and $read2 to ${{sample_id}}_1.fastq.gz and ${{sample_id}}_2.fastq.gz 2>> {log} - mv $read1 results/reads/${{sample_id}}_1.fastq.gz && - mv $read2 results/reads/${{sample_id}}_2.fastq.gz - done < {input.fastq_list} 2> {log} - """ - -rule symlink_fastq: - """ - If reads are provided by user and bcl-convert not used - """ - input: - input_reads = "resources/reads/{sample}_{n}.fastq.gz" - output: - output_reads = "results/reads/{sample}_{n}.fastq.gz" - log: - "logs/symlink_fastq/{sample}_{n}.log" - conda: - "../envs/AmpSeeker-cli.yaml" params: wd = wkdir shell: """ - echo {params.wd}/{output.output_reads} 2>> {log} - ln -sf {params.wd}/{input.input_reads} {params.wd}/{output.output_reads} 2>> {log} - """ \ No newline at end of file + while IFS="," read -r _ sample_id _ _ read1 read2; do + echo renaming $read1 and $read2 to ${{sample_id}}_1.fastq.gz and ${{sample_id}}_2.fastq.gz + mv $read1 resources/reads/${{sample_id}}_1.fastq.gz + mv $read2 resources/reads/${{sample_id}}_2.fastq.gz + done < {input.fastq_list} + """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 0afb9bc..e126865 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -34,9 +34,8 @@ rule set_kernel: def AmpSeekerOutputs(wildcards): inputs = [] if config['bcl-convert']: - inputs.extend(expand("results/reads/{sample}_{n}.fastq.gz", sample=samples, n=[1,2])) inputs.extend(["results/index-read-qc/I1.html", "results/index-read-qc/I2.html"]) - + if plate_info: inputs.extend(["results/notebooks/reads-per-well.ipynb", "docs/ampseeker-results/notebooks/reads-per-well.ipynb"]) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 39ebc08..58badea 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -21,11 +21,12 @@ rule index_read_fastqc: rule fastp: input: - sample=["results/reads/{sample}_1.fastq.gz", "results/reads/{sample}_2.fastq.gz"] + sample=["resources/reads/{sample}_1.fastq.gz", "resources/reads/{sample}_2.fastq.gz"] output: trimmed=["results/trimmed-reads/{sample}_1.fastq.gz", "results/trimmed-reads/{sample}_2.fastq.gz"], html="results/fastp_reports/{sample}.html", json="results/fastp_reports/{sample}.json", + logs="logs/fastp/{sample}.log" log: "logs/fastp/{sample}.log" threads: 4 @@ -34,8 +35,7 @@ rule fastp: rule multiQC: input: - expand("results/fastp_reports/{sample}.html", sample=samples), - expand("results/fastp_reports/{sample}.json", sample=samples), + expand("logs/fastp/{sample}.log", sample=samples), expand("results/alignments/bamStats/{sample}.flagstat", sample=samples), expand("results/coverage/{sample}.per-base.bed.gz", sample=samples), expand("results/vcfs/stats/{dataset}.merged.vcf.txt", dataset=dataset) @@ -43,7 +43,7 @@ rule multiQC: "results/multiqc/multiqc_report.html" log: "logs/multiqc/multiqc.log" - wrapper: + wrapper: "v1.25.0/bio/multiqc" @@ -117,6 +117,7 @@ rule reads_per_well: kernel = "results/.kernel.set", bam = expand("results/alignments/{sample}.bam", sample=samples), bai = expand("results/alignments/{sample}.bam.bai", sample=samples), + stats = expand("results/alignments/bamStats/{sample}.flagstat", sample=samples), metadata = config["metadata"], output: nb = "results/notebooks/reads-per-well.ipynb", From 65ac7ba49d0f7771021559af3aa03a02b9e827d2 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Thu, 6 Jul 2023 20:21:35 +0100 Subject: [PATCH 3/5] read-qual --- config/config.yaml | 4 +- workflow/Snakefile | 1 + workflow/notebooks/read-quality.ipynb | 121 ++++++++++++++++++++++++++ workflow/rules/bcl-convert.smk | 5 ++ workflow/rules/common.smk | 17 +++- workflow/rules/jupyter-book.smk | 2 + workflow/rules/qc-notebooks.smk | 44 ++++++++++ workflow/rules/qc.smk | 25 +----- 8 files changed, 193 insertions(+), 26 deletions(-) create mode 100644 workflow/notebooks/read-quality.ipynb create mode 100644 workflow/rules/qc-notebooks.smk diff --git a/config/config.yaml b/config/config.yaml index eaf69d4..38bb208 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -5,7 +5,7 @@ dataset: gaard-sanger metadata: config/metadata.tsv # Directory of Illumina Miseq Run -illumina-dir: resources/230530_M02853_0061_000000000-L23TH/ +illumina-dir: resources/230629_M05658_0009_000000000-DL7P9/ # Specify whether reference provided is amplicon or wholegenome sequence data # Genome fasta reference files @@ -17,7 +17,7 @@ reference-gff3: resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.1 targets: config/AgamDao.bed # Specify whether to convert bcl files to fastq -bcl-convert: False +bcl-convert: True # Specify whether to run quality-control analyses quality-control: diff --git a/workflow/Snakefile b/workflow/Snakefile index 8da637d..5b40857 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -17,6 +17,7 @@ include: "rules/utilities.smk" include: "rules/bcl-convert.smk" include: "rules/qc.smk" include: "rules/alignment-variantcalling.smk" +include: "rules/qc-notebooks.smk" include: "rules/analysis.smk" include: "rules/jupyter-book.smk" diff --git a/workflow/notebooks/read-quality.ipynb b/workflow/notebooks/read-quality.ipynb new file mode 100644 index 0000000..f17fcc7 --- /dev/null +++ b/workflow/notebooks/read-quality.ipynb @@ -0,0 +1,121 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "parameters", + "remove-input" + ] + }, + "outputs": [], + "source": [ + "metadata_path = '../../config/metadata.tsv'\n", + "index_read_qc = True\n", + "wkdir = \"\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read quality\n", + "\n", + "In this notebook, we link to the quality reports from MultiQC, index read QC and per sample from FastQC and FastP. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "# load panel metadata\n", + "import pandas as pd\n", + "metadata = pd.read_csv(metadata_path, sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "from IPython.display import display, Markdown\n", + "display(Markdown('## MultiQC'))\n", + "display(Markdown('MultiQC is a tool which integrates information from various tools in the workflow (Currently, it is not quite configured correctly)'))\n", + "display(Markdown(f'MultiQC report'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "if index_read_qc:\n", + " display(Markdown('## Index read QC'))\n", + " display(Markdown('Index reads with average quality score below 30 for a given run may be unreliable and cause demultiplexing errors.'))\n", + " display(Markdown(f'Index 1 FASTQC report'))\n", + " display(Markdown(f'Index 2 FASTQC report'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false, + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "display(Markdown('## Sample read QC'))\n", + "for sample in metadata.sampleID:\n", + " display(Markdown(f'{sample} fastp report'))" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "vscode": { + "interpreter": { + "hash": "ce681de973941d5edd9bd94c9a2926b7fe65e17e578a68317f38265a230b8ca7" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/workflow/rules/bcl-convert.smk b/workflow/rules/bcl-convert.smk index 4f3ec64..6d7531f 100644 --- a/workflow/rules/bcl-convert.smk +++ b/workflow/rules/bcl-convert.smk @@ -32,6 +32,11 @@ rule rename_fastq: shell: """ while IFS="," read -r _ sample_id _ _ read1 read2; do + + if [[ $sample_id == "RGSM" ]]; then + continue + fi + echo renaming $read1 and $read2 to ${{sample_id}}_1.fastq.gz and ${{sample_id}}_2.fastq.gz mv $read1 resources/reads/${{sample_id}}_1.fastq.gz mv $read2 resources/reads/${{sample_id}}_2.fastq.gz diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e126865..324c9db 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -33,6 +33,18 @@ rule set_kernel: def AmpSeekerOutputs(wildcards): inputs = [] + + inputs.extend( + expand( + [ + "results/alignments/{sample}.bam", + "results/alignments/{sample}.bam.bai", + ], sample=samples, + ), + ) + + + if config['bcl-convert']: inputs.extend(["results/index-read-qc/I1.html", "results/index-read-qc/I2.html"]) @@ -61,10 +73,13 @@ def AmpSeekerOutputs(wildcards): expand( [ "results/fastp_reports/{sample}.html", + "results/notebooks/read-quality.ipynb", + "docs/ampseeker-results/notebooks/read-quality.ipynb" ], sample=samples, ) - ) + ) + if config['quality-control']['qualimap']: inputs.extend( diff --git a/workflow/rules/jupyter-book.smk b/workflow/rules/jupyter-book.smk index aec5219..be603d4 100644 --- a/workflow/rules/jupyter-book.smk +++ b/workflow/rules/jupyter-book.smk @@ -6,6 +6,8 @@ rule jupyterbook: pca = "docs/ampseeker-results/notebooks/principal-component-analysis.ipynb" if config["analysis"]["pca"] else [], af = "docs/ampseeker-results/notebooks/allele-frequencies.ipynb" if config["analysis"]["allele-frequencies"] else [], sample_map = "docs/ampseeker-results/notebooks/sample-map.ipynb" if config["analysis"]["sample-map"] else [], + read_quality = "docs/ampseeker-results/notebooks/read-quality.ipynb" if config["quality-control"]["fastp"] else [], + reads_per_well = "docs/ampseeker-results/notebooks/reads-per-well.ipynb" if plate_info else [], output: directory("results/ampseeker-results/_build/html/"), home_page = "results/ampseeker-results/_build/html/index.html" diff --git a/workflow/rules/qc-notebooks.smk b/workflow/rules/qc-notebooks.smk new file mode 100644 index 0000000..c87a501 --- /dev/null +++ b/workflow/rules/qc-notebooks.smk @@ -0,0 +1,44 @@ + +rule reads_per_well: + input: + nb = f"{workflow.basedir}/notebooks/reads-per-well.ipynb", + kernel = "results/.kernel.set", + bam = expand("results/alignments/{sample}.bam", sample=samples), + bai = expand("results/alignments/{sample}.bam.bai", sample=samples), + stats = expand("results/alignments/bamStats/{sample}.flagstat", sample=samples), + metadata = config["metadata"], + output: + nb = "results/notebooks/reads-per-well.ipynb", + docs_nb = "docs/ampseeker-results/notebooks/reads-per-well.ipynb" + conda: + "../envs/AmpSeeker-python.yaml" + log: + "logs/notebooks/reads-per-well.log" + shell: + """ + papermill {input.nb} {output.nb} -k AmpSeq_python -p metadata_path {input.metadata} 2> {log} + cp {output.nb} {output.docs_nb} 2>> {log} + """ + +rule read_qc: + input: + nb = f"{workflow.basedir}/notebooks/read-quality.ipynb", + kernel = "results/.kernel.set", + fastp = expand("results/fastp_reports/{sample}.json", sample=samples), + index_qc = rules.index_read_fastqc.output, + metadata = config["metadata"], + output: + nb = "results/notebooks/read-quality.ipynb", + docs_nb = "docs/ampseeker-results/notebooks/read-quality.ipynb" + conda: + "../envs/AmpSeeker-python.yaml" + log: + "logs/notebooks/reads-quality.log" + params: + index_qc = config['bcl-convert'], + wd = wkdir + shell: + """ + papermill {input.nb} {output.nb} -k AmpSeq_python -p metadata_path {input.metadata} -p index_read_qc {params.index_qc} -p wkdir {params.wd} 2> {log} + cp {output.nb} {output.docs_nb} 2>> {log} + """ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 58badea..a074f53 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -68,7 +68,7 @@ rule mosdepthCoverage: mosdepth {params.prefix} {input.bam} --fast-mode --threads {threads} 2> {log} """ -rule BamStats: +rule bam_stats: """ Calculate mapping statistics with samtools flagstat """ @@ -97,7 +97,7 @@ rule qualimap: wrapper: "v1.25.0/bio/qualimap/bamqc" -rule vcfStats: +rule vcf_stats: input: vcf = "results/vcfs/{dataset}.merged.vcf" output: @@ -110,24 +110,3 @@ rule vcfStats: """ bcftools stats {input.vcf} > {output.stats} 2> {log} """ - -rule reads_per_well: - input: - nb = f"{workflow.basedir}/notebooks/reads-per-well.ipynb", - kernel = "results/.kernel.set", - bam = expand("results/alignments/{sample}.bam", sample=samples), - bai = expand("results/alignments/{sample}.bam.bai", sample=samples), - stats = expand("results/alignments/bamStats/{sample}.flagstat", sample=samples), - metadata = config["metadata"], - output: - nb = "results/notebooks/reads-per-well.ipynb", - docs_nb = "docs/ampseeker-results/notebooks/reads-per-well.ipynb" - conda: - "../envs/AmpSeeker-python.yaml" - log: - "logs/notebooks/reads-per-well.log" - shell: - """ - papermill {input.nb} {output.nb} -k AmpSeq_python -p metadata_path {input.metadata} 2> {log} - cp {output.nb} {output.docs_nb} 2>> {log} - """ From 5d6c7703f546e406fa5ab08d2a94a8986d992523 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Thu, 6 Jul 2023 20:31:54 +0100 Subject: [PATCH 4/5] fix --- .test/config/config.yaml | 2 +- workflow/rules/qc-notebooks.smk | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index e487837..e0944a2 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -5,7 +5,7 @@ dataset: "AgamDao" metadata: "config/metadata.tsv" # Directory of Illumina Miseq Run -illumina-dir: /home/sanj/projects/AmpSeqVIGG2022/resources/220329_M05658_0010_000000000-K9TYL +illumina-dir: "" # Genome fasta reference files reference-name: 'AgamP4' diff --git a/workflow/rules/qc-notebooks.smk b/workflow/rules/qc-notebooks.smk index c87a501..6829f03 100644 --- a/workflow/rules/qc-notebooks.smk +++ b/workflow/rules/qc-notebooks.smk @@ -25,7 +25,6 @@ rule read_qc: nb = f"{workflow.basedir}/notebooks/read-quality.ipynb", kernel = "results/.kernel.set", fastp = expand("results/fastp_reports/{sample}.json", sample=samples), - index_qc = rules.index_read_fastqc.output, metadata = config["metadata"], output: nb = "results/notebooks/read-quality.ipynb", From b29eb9c4b3bbb48389485088822d1988ce51ebf1 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Thu, 6 Jul 2023 20:47:51 +0100 Subject: [PATCH 5/5] remove igv from book for now --- docs/ampseeker-results/_toc.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/ampseeker-results/_toc.yml b/docs/ampseeker-results/_toc.yml index 4322954..d2e1844 100644 --- a/docs/ampseeker-results/_toc.yml +++ b/docs/ampseeker-results/_toc.yml @@ -8,7 +8,6 @@ parts: chapters: - file: notebooks/read-quality - file: notebooks/reads-per-well - - file: notebooks/IGV-explore - caption: Results chapters: - file: notebooks/sample-map