From 0a3181f9c57d7da760826047d7db4b265a6fcb66 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" <jfy133@gmail.com>
Date: Thu, 12 Sep 2024 10:17:35 +0200
Subject: [PATCH] Apply suggestions from code review

---
 CHANGELOG.md                              |  5 ++--
 README.md                                 |  3 ---
 assets/multiqc_config.yml                 |  4 +--
 conf/modules.config                       | 14 +++++-----
 docs/output.md                            |  6 ++---
 docs/usage.md                             | 31 ++++++++++-------------
 nextflow_schema.json                      |  2 +-
 subworkflows/local/nonpareil.nf           | 17 -------------
 subworkflows/local/visualization_krona.nf |  2 +-
 9 files changed, 30 insertions(+), 54 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 70ecabcf..d53cf0f6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,12 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
-- [#417](https://github.com/nf-core/taxprofiler/pull/417) - Added reference-free metagenome estimation with Nonpareil (added by @jfy133)
+- [#417](https://github.com/nf-core/taxprofiler/pull/417) - Added reference-free metagenome complexity/coverage estimation with Nonpareil (added by @jfy133)
 - [#466](https://github.com/nf-core/taxprofiler/pull/466) - Input database sheets now require a `db_type` column to distinguish between short- and long-read databases (added by @LilyAnderssonLee)
 - [#505](https://github.com/nf-core/taxprofiler/pull/505) - Add small files to the file `tower.yml` (added by @LilyAnderssonLee)
 - [#508](https://github.com/nf-core/taxprofiler/pull/508) - Add `nanoq` as a filtering tool for nanopore reads (added by @LilyAnderssonLee)
 - [#511](https://github.com/nf-core/taxprofiler/pull/511) - Add `porechop_abi` as an alternative adapter removal tool for long reads nanopore data (added by @LilyAnderssonLee)
-- [#512](https://github.com/nf-core/taxprofiler/pull/512) - Update all tools to the latest version and include nf-test (Updated by @LilyAnderssonLee & @jfy133)
+- [#512](https://github.com/nf-core/taxprofiler/pull/512) - Update all tools to the latest version and include nf-test (updated by @LilyAnderssonLee & @jfy133)
 
 ### `Fixed`
 
@@ -36,7 +36,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 | minimap2      | 2.24             | 2.28        |
 | motus/profile | 3.0.3            | 3.1.0       |
 | multiqc       | 1.21             | 1.24.1      |
-| nanoq         |                  | 0.10.0      |
 | samtools      | 1.17             | 1.20        |
 | untar         | 4.7              | 4.8         |
 
diff --git a/README.md b/README.md
index 4bf4174a..666d27ac 100644
--- a/README.md
+++ b/README.md
@@ -23,9 +23,6 @@
 
 **nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic classification and profiling of shotgun short- and long-read metagenomic data. It allows for in-parallel taxonomic identification of reads or taxonomic abundance estimation with multiple classification and profiling tools against multiple databases, and produces standardised output tables for facilitating results comparison between different tools and databases.
 
-The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
-
-On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/scnanoseq/results).
 
 ## Pipeline summary
 
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 8b5de411..82334688 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -64,8 +64,8 @@ custom_logo_title: "nf-core/taxprofiler"
 run_modules:
   - fastqc
   - adapterRemoval
-    - fastp
-    - nonpareil
+  - fastp
+  - nonpareil
   - bbduk
   - prinseqplusplus
   - porechop
diff --git a/conf/modules.config b/conf/modules.config
index d298a828..1b82c9ee 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -255,7 +255,7 @@ process {
             [
                 path: { "${params.outdir}/porechop" },
                 mode: params.publish_dir_mode,
-                pattern: '*.fastq.gz',
+                pattern: '*_porechop.fastq.gz',
                 enabled: params.save_preprocessed_reads
             ],
             [
@@ -266,7 +266,7 @@ process {
             [
                 path: { "${params.outdir}/analysis_ready_fastqs" },
                 mode: params.publish_dir_mode,
-                pattern: '*.fastq.gz',
+                pattern: '*_porechop.fastq.gz',
                 enabled: params.save_analysis_ready_fastqs,
                 saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && params.longread_qc_skipqualityfilter && !params.longread_qc_skipadaptertrim && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
             ]
@@ -279,7 +279,7 @@ process {
             [
                 path: { "${params.outdir}/porechop_abi" },
                 mode: params.publish_dir_mode,
-                pattern: '*.fastq.gz',
+                pattern: '*_porechop_abi.fastq.gz',
                 enabled: params.save_preprocessed_reads
             ],
             [
@@ -290,7 +290,7 @@ process {
             [
                 path: { "${params.outdir}/analysis_ready_fastqs" },
                 mode: params.publish_dir_mode,
-                pattern: '*.fastq.gz',
+                pattern: '*_porechop_abi.fastq.gz',
                 enabled: params.save_analysis_ready_fastqs,
                 saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && params.longread_qc_skipqualityfilter && !params.longread_qc_skipadaptertrim && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
             ]
@@ -339,18 +339,18 @@ process {
             [
                 path: { "${params.outdir}/nanoq" },
                 mode: params.publish_dir_mode,
-                pattern: '*.fastq.gz',
+                pattern: '*_filtered.fastq.gz',
                 enabled: params.save_preprocessed_reads
             ],
             [
                 path: { "${params.outdir}/nanoq" },
                 mode: params.publish_dir_mode,
-                pattern: '*.stats'
+                pattern: '*_filtered.stats'
             ],
             [
                 path: { "${params.outdir}/analysis_ready_fastqs" },
                 mode: params.publish_dir_mode,
-                pattern: '*.fastq.gz',
+                pattern: '*_filtered.fastq.gz',
                 enabled: params.save_analysis_ready_fastqs,
                 saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && !params.longread_qc_skipqualityfilter && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
             ]
diff --git a/docs/output.md b/docs/output.md
index 89cafb8f..80b02ecb 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -15,11 +15,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [falco](#fastqc) - Alternative to FastQC for raw read QC
 - [fastp](#fastp) - Adapter trimming for Illumina data
 - [AdapterRemoval](#adapterremoval) - Adapter trimming for Illumina data
-- [Porechop](#porechop) - Adapter removal for Oxford Nanopore data
-- [Porechop_ABI](#porechop_abi) - Adapter removal for Oxford Nanopore data
 - [Nonpareil](#nonpareil) - Read redundancy and metagenome coverage estimation for short reads
 - [BBDuk](#bbduk) - Quality trimming and filtering for Illumina data
 - [PRINSEQ++](#prinseq) - Quality trimming and filtering for Illunina data
+- [Porechop](#porechop) - Adapter removal for Oxford Nanopore data
+- [Porechop_ABI](#porechop_abi) - Adapter removal for Oxford Nanopore data
 - [Filtlong](#filtlong) - Quality trimming and filtering for Nanopore data
 - [Nanoq] (#nanoq) - Quality trimming and filtering for Nanopore data
 - [Bowtie2](#bowtie2) - Host removal for Illumina reads
@@ -155,7 +155,7 @@ The resulting `.fastq` files may _not_ always be the 'final' reads that go into
 
   </details>
 
-  In most cases you will just want to look at the PNG files which contain the extrapolation information for estimating how much of the metagenome 'coverage' you will recover if you sequence more (i.e., to help indicate at what point you will just keep sequencing redundant reads that provide no more new taxonomic information).
+In most cases you will just want to look at the PNG files which contain the extrapolation information for estimating how much of the metagenome 'coverage' you will recover if you sequence more (i.e., to help indicate at what point you will just keep sequencing redundant reads that provide no more new taxonomic information).
 
   The `.npo` files can be used for re-generating and customising the plots using the companion `Nonpareil` R package.
 
diff --git a/docs/usage.md b/docs/usage.md
index cb511840..bb7ff63e 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -274,25 +274,22 @@ Before using this tool please note the following caveats:
 
 :::warning
 
-- It is not recommended to run this on deep sequencing data, or very large datasets
-  - Nonpareil requires uncompressed FASTQ files, and nf-core/taxprofiler will uncompress these in your working directory, potentially with a extremely large hard-drive footprint.
-- Your shortest reads _after_ processing should not go below 24bp (see warning below)
 - It is not recommended to keep unmerged (`--shortread_qc_includeunmerged`) reads when using the calculation.
-
-:::info
-If you get errors regarding the 'kmer' value is not correct, make sure your shortest reads _after_ processing is not less than 24bp.
-
-If this is the case you will need to specify in a custom config
-
-```nextflow
-process {
-  withName: NONPAREIL_NONPAREIL {
-    ext.args = { "-k <NUMBER>" }
+- Your shortest reads _after_ processing should not go below 24bp
+
+    If you get errors saying the 'kmer' value is not correct, make sure your shortest reads _after_ processing are not shorter than 24bp.
+
+    If this is the case, you will need to specify a suitable kmer size in a custom config:
+
+    ```nextflow
+    process {
+      withName: NONPAREIL_NONPAREIL {
+        ext.args = { "-k <NUMBER>" }
+      }
     }
-}
-```
-
-Where `<NUMBER>` should be at least the shortest read in your library
+    ```
+
+    Where `<NUMBER>` should be no larger than the length of the shortest read in your library
 :::
 
 #### Complexity Filtering
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 4fb4ba40..3ada1a56 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -307,7 +307,7 @@
                     "type": "boolean",
                     "description": "Turn on short-read metagenome sequencing redundancy estimation with nonpareil. Warning: only use for shallow short-read sequencing datasets.",
                     "fa_icon": "fas fa-toggle-on",
-                    "help_text": "Turns on [nonpareil](https://nonpareil.readthedocs.io/en/latest/), a tool for estimating metagenome 'coverage', i.e, whether all genomes within the metagenome have had at least one read sequenced.\n\nIt estimates this by checking the read redundancy between a subsample of reads versus other reads in the library.\n\nThe more redundancy that exists, the larger the assumption that all possible reads in the library have been sequenced and all 'redundant' reads are simply sequencing of PCR duplicates.\n\nThe lower the redundancy, the more sequencing should be done until the entire metagenome has been captured. The output can be used to guide the amount of further sequencing is required.\n\nNote this is not the same as _genomic_ coverage, which is the number of times a base-pair is covered by unique reads on a reference genome.\n\nBefore using this tool please note the following caveats:\n\n- It is not recommended to run this on deep sequencing data, or very large datasets\n  - Nonpareil requires uncompressed FASTQ files, and nf-core/taxprofiler will uncompress these in your working directory, potentially with a extremely large hard-drive footprint.\n- Your shortest reads _after_ processing should not go below 24bp (see warning below)\n- It is not recommended to keep unmerged (`--shortread_qc_includeunmerged`) reads when using the calculation.\n:::warning\nOn default settings, with 'kmer mode', you must make sure that your shortest processed reads do not go below 24 bp (the default kmer size).\n\nIf you have errors regarding kmer size, you will need to specify in a custom config in a process block\n\n```\n    withName: NONPAREIL {\n        ext.args = { \"-k <NUMBER>\" }\n    }\n```\n\nWhere `<NUMBER>` should be at least the shortest read in your library\n:::"
+                    "help_text": "Turns on [nonpareil](https://nonpareil.readthedocs.io/en/latest/), a tool for estimating metagenome 'coverage', i.e., whether all genomes within the metagenome have had at least one read sequenced.\n\nIt estimates this by checking the read redundancy between a subsample of reads versus other reads in the library.\n\nThe more redundancy that exists, the larger the assumption that all possible reads in the library have been sequenced and all 'redundant' reads are simply sequencing of PCR duplicates.\n\nThe lower the redundancy, the more sequencing should be done until the entire metagenome has been captured. The output can be used to guide how much further sequencing is required.\n\nNote this is not the same as _genomic_ coverage, which is the number of times a base-pair is covered by unique reads on a reference genome.\n\nBefore using this tool please note the following caveats:\n\n- It is not recommended to run this on deep sequencing data, or very large datasets\n- Your shortest reads _after_ processing should not go below 24bp (see warning below)\n- It is not recommended to keep unmerged (`--shortread_qc_includeunmerged`) reads when using the calculation.\n\n:::warning\nOn default settings, with 'kmer mode', you must make sure that your shortest processed reads do not go below 24 bp (the default kmer size).\n\nIf you have errors regarding kmer size, you will need to specify in a custom config in a process block\n\n```\n    withName: NONPAREIL_NONPAREIL {\n        ext.args = { \"-k <NUMBER>\" }\n    }\n```\n\nWhere `<NUMBER>` should be no larger than the length of the shortest read in your library\n:::"
                 },
                 "shortread_redundancyestimation_mode": {
                     "type": "string",
diff --git a/subworkflows/local/nonpareil.nf b/subworkflows/local/nonpareil.nf
index e7900aa5..3489ab09 100644
--- a/subworkflows/local/nonpareil.nf
+++ b/subworkflows/local/nonpareil.nf
@@ -3,23 +3,6 @@ include { NONPAREIL_CURVE            } from '../../modules/nf-core/nonpareil/cur
 include { NONPAREIL_SET              } from '../../modules/nf-core/nonpareil/set/main'
 include { NONPAREIL_NONPAREILCURVESR } from '../../modules/nf-core/nonpareil/nonpareilcurvesr/main'
 
-// Custom Functions
-
-/*
-
-*/
-def extractNonpareilExtensionFromArrays(ch_input) {
-
-return ch_profile
-    .map { meta, profile -> [meta.db_name, meta, profile] }
-    .combine(ch_database, by: 0)
-    .multiMap {
-        key, meta, profile, db_meta, db ->
-            profile: [meta, profile]
-            db: db
-    }
-}
-
 workflow NONPAREIL {
     take:
     reads     // [ [ meta ], [ reads ] ]
diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf
index eb9b6cf6..2b57a702 100644
--- a/subworkflows/local/visualization_krona.nf
+++ b/subworkflows/local/visualization_krona.nf
@@ -99,7 +99,7 @@ workflow VISUALIZATION_KRONA {
 
         KRONA_KTIMPORTTAXONOMY ( ch_krona_taxonomy_for_input, file(params.krona_taxonomy_directory, checkExists: true) )
         ch_krona_html.mix( KRONA_KTIMPORTTAXONOMY.out.html )
-        ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() )
+        ch_versions = ch_versions.mix( GUNZIP.out.versions.first() )
         ch_versions = ch_versions.mix( MEGAN_RMA2INFO_KRONA.out.versions.first() )
         ch_versions = ch_versions.mix( KRONA_KTIMPORTTAXONOMY.out.versions.first() )
     }