diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 4ecfbfe..0000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "nfcore", - "image": "nfcore/gitpod:latest", - "remoteUser": "gitpod", - "runArgs": ["--privileged"], - - // Configure tool-specific properties. - "customizations": { - // Configure properties specific to VS Code. - "vscode": { - // Set *default* container specific settings.json values on container create. - "settings": { - "python.defaultInterpreterPath": "/opt/conda/bin/python", - "python.linting.enabled": true, - "python.linting.pylintEnabled": true, - "python.formatting.autopep8Path": "/opt/conda/bin/autopep8", - "python.formatting.yapfPath": "/opt/conda/bin/yapf", - "python.linting.flake8Path": "/opt/conda/bin/flake8", - "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", - "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint" - }, - - // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } - } -} diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index aecbd6e..e9013a8 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,17 +1,17 @@ # phac-nml/iridanextexample: Contributing Guidelines Hi there! -Many thanks for taking an interest in improving phac-nml/iridanextexample. +Many thanks for taking an interest in improving phac-nml/fetchdatairidanext. -We try to manage the required tasks for phac-nml/iridanextexample using GitHub issues, you probably came to this page when creating one. +We try to manage the required tasks for phac-nml/fetchdatairidanext using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. 
## Contribution workflow -If you'd like to write some code for phac-nml/iridanextexample, the standard workflow is as follows: +If you'd like to write some code for phac-nml/fetchdatairidanext, the standard workflow is as follows: -1. Check that there isn't already an issue about your idea in the [phac-nml/iridanextexample issues](https://github.com/phac-nml/iridanextexample/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this -2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [phac-nml/iridanextexample repository](https://github.com/phac-nml/iridanextexample) to your GitHub account +1. Check that there isn't already an issue about your idea in the [phac-nml/fetchdatairidanext issues](https://github.com/phac-nml/fetchdatairidanext/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this +2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [phac-nml/fetchdatairidanext repository](https://github.com/phac-nml/fetchdatairidanext) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). 5. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged @@ -27,7 +27,7 @@ There are typically two types of tests that run: ### Lint tests -`nf-core` has a [set of guidelines](https://nf-co.re/developers/guidelines) which all pipelines must adhere to. +`phac-nml` has a [set of guidelines](https://github.com/phac-nml/pipeline-standards) which all pipelines must adhere to. These are a subset of the [nf-core set of guidelines](https://nf-co.re/developers/guidelines). 
To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint ` command. If any failures or warnings are encountered, please follow the listed URL for more documentation. @@ -49,11 +49,11 @@ These tests are run both with the latest available version of `Nextflow` and als ## Getting help -For further information/help, please consult the [phac-nml/iridanextexample documentation](https://github.com/phac-nml/iridanextexample/). +For further information/help, please consult the [phac-nml/fetchdatairidanext documentation](https://github.com/phac-nml/fetchdatairidanext/). ## Pipeline contribution conventions -To make the phac-nml/iridanextexample code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. +To make the phac-nml/fetchdatairidanext code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. ### Adding a new step @@ -67,8 +67,7 @@ If you wish to contribute a new step, please use the following coding standards: 6. Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. 8. If applicable, add a new test command in `.github/workflow/ci.yml`. -9. Update MultiQC config `assets/multiqc_config.yml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. -10. Add a description of the output files and if relevant any appropriate images from the MultiQC report to `docs/output.md`. +9. Add a description of the output files to `docs/output.md`. 
### Default values @@ -96,18 +95,3 @@ If you are using a new feature from core Nextflow, you may bump the minimum requ ### Images and figures For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines). - -## GitHub Codespaces - -This repo includes a devcontainer configuration which will create a GitHub Codespaces for Nextflow development! This is an online developer environment that runs in your browser, complete with VSCode and a terminal. - -To get started: - -- Open the repo in [Codespaces](https://github.com/phac-nml/iridanextexample/codespaces) -- Tools installed - - nf-core - - Nextflow - -Devcontainer specs: - -- [DevContainer config](.devcontainer/devcontainer.json) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index f0c034e..2a8df94 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,4 +1,4 @@ contact_links: - name: "GitHub" - url: https://github.com/phac-nml/iridanextexample + url: https://github.com/phac-nml/fetchdatairidanext about: The GitHub page for development. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index e67ccaa..0204616 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -1,5 +1,5 @@ name: Feature request -description: Suggest an idea for the phac-nml/iridanextexample pipeline +description: Suggest an idea for the phac-nml/fetchdatairidanext pipeline labels: enhancement body: - type: textarea diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 63aed5f..146c253 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,21 +1,21 @@ ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! 
-- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/phac-nml/iridanextexample/tree/main/.github/CONTRIBUTING.md) +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/phac-nml/fetchdatairidanext/tree/main/.github/CONTRIBUTING.md) - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index d4ad0e4..d72ca17 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -11,9 +11,9 @@ jobs: steps: # PRs to the phac-nml repo main branch are only ok if coming from the phac-nml repo `dev` or any `patch` branches - name: Check PRs - if: github.repository == 'phac-nml/iridanextexample' + if: github.repository == 'phac-nml/fetchdatairidanext' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == phac-nml/iridanextexample ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == phac-nml/fetchdatairidanext ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03e1016..745d17e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: test: name: Run pipeline with test data # Only run on push if this is the phac-nml dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'phac-nml/iridanextexample') }}" + if: "${{ github.event_name != 
'push' || (github.event_name == 'push' && github.repository == 'phac-nml/fetchdatairidanext') }}" runs-on: ubuntu-latest strategy: matrix: diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index b8bdd21..544a5a8 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -22,7 +22,7 @@ jobs: run: npm install -g editorconfig-checker - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') + run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile\|.sra') Prettier: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 54012e5..861bd49 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,12 @@ .nextflow* work/ -data/ +/data/ results/ .DS_Store testing/ testing* *.pyc *.swp +/.nf-test +/.nf-test.log +ids.csv diff --git a/.nf-core.yml b/.nf-core.yml index eaccb00..8814b7c 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,19 +2,23 @@ repository_type: pipeline lint: files_exist: - - assets/nf-core-iridanextexample_logo_light.png - - docs/images/nf-core-iridanextexample_logo_light.png - - docs/images/nf-core-iridanextexample_logo_dark.png + - assets/nf-core-fetchdatairidanext_logo_light.png + - docs/images/nf-core-fetchdatairidanext_logo_light.png + - docs/images/nf-core-fetchdatairidanext_logo_dark.png - .github/workflows/awstest.yml - .github/workflows/awsfulltest.yml + - CODE_OF_CONDUCT.md files_unchanged: - assets/sendmail_template.txt - assets/email_template.html - lib/NfcoreTemplate.groovy - .github/CONTRIBUTING.md - .github/ISSUE_TEMPLATE/bug_report.yml + - .github/ISSUE_TEMPLATE/feature_request.yml - .github/PULL_REQUEST_TEMPLATE.md - .github/workflows/branch.yml + - .github/workflows/linting.yml + - .gitignore - assets/email_template.txt - 
docs/README.md - LICENSE diff --git a/CHANGELOG.md b/CHANGELOG.md index b6443cc..52eb926 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,25 +1,10 @@ -# phac-nml/iridanextexample: Changelog +# phac-nml/fetchdatairidanext: Changelog The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## 1.0.2 - 2023/12/18 +## [1.0.0] - 2024-01-26 -- Removed GitHub workflows that weren't needed. -- Adding additional parameters for testing purposes. +### Added -## 1.0.1 - 2023/12/06 - -Allowing non-gzipped FASTQ files as input. Default branch is now main. - -## 1.0.0 - 2023/11/30 - -Initial release of phac-nml/iridanextexample, created with the [nf-core](https://nf-co.re/) template. - -### `Added` - -### `Fixed` - -### `Dependencies` - -### `Deprecated` +- Initial release of fetchdatairidanext pipeline which will download reads from NCBI/INSDC archives. diff --git a/CITATIONS.md b/CITATIONS.md index 5c12d47..ff45ad4 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,4 +1,4 @@ -# phac-nml/iridanextexample: Citations +# phac-nml/fetchdatairidanext: Citations ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) @@ -8,8 +8,13 @@ > Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311. 
+## [nf-test](https://www.nf-test.com/) + ## Pipeline tools +- [NCBI sra-tools](https://github.com/ncbi/sra-tools) +- [nf-core fastq_download_prefetch_fasterqdump_sratools subworkflow](https://nf-co.re/subworkflows/fastq_download_prefetch_fasterqdump_sratools) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index c089ec7..0000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,182 +0,0 @@ -# Code of Conduct at nf-core (v1.4) - -## Our Pledge - -In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: - -- Age -- Ability -- Body size -- Caste -- Familial status -- Gender identity and expression -- Geographical location -- Level of experience -- Nationality and national origins -- Native language -- Neurodiversity -- Race or ethnicity -- Religion -- Sexual identity and orientation -- Socioeconomic status - -Please note that the list above is alphabetised and is therefore not ranked in any order of preference or importance. - -## Preamble - -:::note -This Code of Conduct (CoC) has been drafted by Renuka Kudva, Cris Tuñí, and Michael Heuer, with input from the nf-core Core Team and Susanna Marquez from the nf-core community. "We", in this document, refers to the Safety Officers and members of the nf-core Core Team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will be amended periodically to keep it up-to-date. In case of any dispute, the most current version will apply. -::: - -An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). - -Our Safety Officers are Saba Nafees, Cris Tuñí, and Michael Heuer. 
- -nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. - -We have therefore adopted this CoC, which we require all members of our community and attendees of nf-core events to adhere to in all our workspaces at all times. Workspaces include, but are not limited to, Slack, meetings on Zoom, gather.town, YouTube live etc. - -Our CoC will be strictly enforced and the nf-core team reserves the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. - -We ask all members of our community to help maintain supportive and productive workspaces and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. - -Questions, concerns, or ideas on what we can include? Contact members of the Safety Team on Slack or email safety [at] nf-co [dot] re. - -## Our Responsibilities - -Members of the Safety Team (the Safety Officers) are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. - -The Safety Team, in consultation with the nf-core core team, have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this CoC, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. - -Members of the core team or the Safety Team who violate the CoC will be required to recuse themselves pending investigation. 
They will not have access to any reports of the violations and will be subject to the same actions as others in violation of the CoC. - -## When and where does this Code of Conduct apply? - -Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events, such as hackathons, workshops, bytesize, and collaborative workspaces on gather.town. These guidelines include, but are not limited to, the following (listed alphabetically and therefore in no order of preference): - -- Communicating with an official project email address. -- Communicating with community members within the nf-core Slack channel. -- Participating in hackathons organised by nf-core (both online and in-person events). -- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence, and on the nf-core gather.town workspace. -- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, gather.town, Jitsi, YouTube live etc. -- Representing nf-core on social media. This includes both official and personal accounts. - -## nf-core cares 😊 - -nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include, but are not limited to, the following (listed in alphabetical order): - -- Ask for consent before sharing another community member’s personal information (including photographs) on social media. -- Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. -- Celebrate your accomplishments! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) -- Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. 
If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) -- Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) -- Focus on what is best for the team and the community. (When in doubt, ask) -- Accept feedback, yet be unafraid to question, deliberate, and learn. -- Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) -- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communication to be kind.**) -- Take breaks when you feel like you need them. -- Use welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack) - -## nf-core frowns on 😕 - -The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this CoC. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces: - -- Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. -- “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. -- Spamming or trolling of individuals on social media. -- Use of sexual or discriminatory imagery, comments, jokes, or unwelcome sexual attention. 
-- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion, or work experience. - -### Online Trolling - -The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the risk of online trolling. This is unacceptable — reports of such behaviour will be taken very seriously and perpetrators will be excluded from activities immediately. - -All community members are **required** to ask members of the group they are working with for explicit consent prior to taking screenshots of individuals during video calls. - -## Procedures for reporting CoC violations - -If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. - -You can reach out to members of the Safety Team (Saba Nafees, Cris Tuñí, and Michael Heuer) on Slack. Alternatively, contact a member of the nf-core core team [nf-core core team](https://nf-co.re/about), and they will forward your concerns to the Safety Team. - -Issues directly concerning members of the Core Team or the Safety Team will be dealt with by other members of the core team and the safety manager — possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson and details will be shared in due course. - -All reports will be handled with the utmost discretion and confidentiality. - -You can also report any CoC violations to safety [at] nf-co [dot] re. In your email report, please do your best to include: - -- Your contact information. -- Identifying information (e.g. names, nicknames, pseudonyms) of the participant who has violated the Code of Conduct. -- The behaviour that was in violation and the circumstances surrounding the incident. -- The approximate time of the behaviour (if different than the time the report was made). 
-- Other people involved in the incident, if applicable. -- If you believe the incident is ongoing. -- If there is a publicly available record (e.g. mailing list record, a screenshot). -- Any additional information. - -After you file a report, one or more members of our Safety Team will contact you to follow up on your report. - -## Who will read and handle reports - -All reports will be read and handled by the members of the Safety Team at nf-core. - -If members of the Safety Team are deemed to have a conflict of interest with a report, they will be required to recuse themselves as per our Code of Conduct and will not have access to any follow-ups. - -To keep this first report confidential from any of the Safety Team members, please submit your first report by direct messaging on Slack/direct email to any of the nf-core members you are comfortable disclosing the information to, and be explicit about which member(s) you do not consent to sharing the information with. - -## Reviewing reports - -After receiving the report, members of the Safety Team will review the incident report to determine whether immediate action is required, for example, whether there is immediate threat to participants’ safety. - -The Safety Team, in consultation with members of the nf-core core team, will assess the information to determine whether the report constitutes a Code of Conduct violation, for them to decide on a course of action. - -In the case of insufficient information, one or more members of the Safety Team may contact the reporter, the reportee, or any other attendees to obtain more information. - -Once additional information is gathered, the Safety Team will collectively review and decide on the best course of action to take, if any. The Safety Team reserves the right to not act on a report. 
- -## Confidentiality - -All reports, and any additional information included, are only shared with the team of safety officers (and possibly members of the core team, in case the safety officer is in violation of the CoC). We will respect confidentiality requests for the purpose of protecting victims of abuse. - -We will not name harassment victims, beyond discussions between the safety officer and members of the nf-core team, without the explicit consent of the individuals involved. - -## Enforcement - -Actions taken by the nf-core’s Safety Team may include, but are not limited to: - -- Asking anyone to stop a behaviour. -- Asking anyone to leave the event and online spaces either temporarily, for the remainder of the event, or permanently. -- Removing access to the gather.town and Slack, either temporarily or permanently. -- Communicating to all participants to reinforce our expectations for conduct and remind what is unacceptable behaviour; this may be public for practical reasons. -- Communicating to all participants that an incident has taken place and how we will act or have acted — this may be for the purpose of letting event participants know we are aware of and dealing with the incident. -- Banning anyone from participating in nf-core-managed spaces, future events, and activities, either temporarily or permanently. -- No action. 
- -## Attribution and Acknowledgements - -- The [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4) -- The [OpenCon 2017 Code of Conduct](http://www.opencon2017.org/code_of_conduct) (CC BY 4.0 OpenCon organisers, SPARC and Right to Research Coalition) -- The [eLife innovation sprint 2020 Code of Conduct](https://sprint.elifesciences.org/code-of-conduct/) -- The [Mozilla Community Participation Guidelines v3.1](https://www.mozilla.org/en-US/about/governance/policies/participation/) (version 3.1, CC BY-SA 3.0 Mozilla) - -## Changelog - -### v1.4 - February 8th, 2022 - -- Included a new member of the Safety Team. Corrected a typographical error in the text. - -### v1.3 - December 10th, 2021 - -- Added a statement that the CoC applies to nf-core gather.town workspaces. Corrected typographical errors in the text. - -### v1.2 - November 12th, 2021 - -- Removed information specific to reporting CoC violations at the Hackathon in October 2021. - -### v1.1 - October 14th, 2021 - -- Updated with names of new Safety Officers and specific information for the hackathon in October 2021. - -### v1.0 - March 15th, 2021 - -- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. diff --git a/LICENSE b/LICENSE index ae9c66b..0ca6cdb 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Aaron Petkau +Copyright (c) Government of Canada Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 82f76d2..e9ca09f 100644 --- a/README.md +++ b/README.md @@ -1,101 +1,106 @@ [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A523.04.3-brightgreen.svg)](https://www.nextflow.io/) -# Example Pipeline for IRIDA Next +# fetchdatairidanext pipeline -This is an example pipeline to be used for integration with IRIDA Next. 
+This pipeline can be used to fetch data from NCBI for integration into [IRIDA Next][irida-next]. # Input The input to the pipeline is a standard sample sheet (passed as `--input samplesheet.csv`) that looks like: -| sample | fastq_1 | fastq_2 | -| ------- | --------------- | --------------- | -| SampleA | file_1.fastq.gz | file_2.fastq.gz | +| sample | insdc_accession | +| ------- | --------------- | +| SampleA | ERR1109373 | +| SampleB | SRR13191702 | -The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/). +That is, there are two columns: + +- **sample**: The sample identifier that the downloaded read data should be associated with. +- **insdc_accession**: The accession from the [International Nucleotide Sequence Database Collaboration (INSDC)][insdc] for the data to download (currently only sequence runs are supported, e.g., starting with `SRR`, `ERR`, or `DRR`). + +The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). An example of this file is provided at [assets/samplesheet.csv](assets/samplesheet.csv). # Parameters -The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run. +The main parameters are `--input` as defined above and `--outdir` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers (or `-profile docker` for docker) and `-r [branch]` to specify which GitHub branch you would like to run. -Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schmea.json). +Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schema.json). 
# Running -To run the pipeline, please do: +## Test data + +To run the pipeline with test data, please do: + +```bash +nextflow run phac-nml/fetchdatairidanext -profile test,docker --outdir results +``` + +The downloaded data will appear in `results/`. A JSON file for integrating data with IRIDA Next will be written to `results/iridanext.output.json.gz` (see the [Output](#output) section for details). + +## Other data + +To run the pipeline with other data (a custom samplesheet), please do: ```bash -nextflow run phac-nml/iridanextexample -profile singularity -r main -latest --input assets/samplesheet.csv --outdir results +nextflow run phac-nml/fetchdatairidanext -profile docker --input assets/samplesheet.csv --outdir results ``` Where the `samplesheet.csv` is structured as specified in the [Input](#input) section. # Output -A JSON file for loading metadata into IRIDA Next is output by this pipeline. The format of this JSON file is specified in our [Pipeline Standards for the IRIDA Next JSON](https://github.com/phac-nml/pipeline-standards#32-irida-next-json). This JSON file is written directly within the `--outdir` provided to the pipeline with the name `irida.output.json.gz` (ex: `[outdir]/irida.output.json.gz`). +## Read data -An example of the what the contents of the IRIDA Next JSON file looks like for this particular pipeline is as follows: +The sequence reads will appear in the `results/reads` directory (assuming `--outdir results` is specified). For example: ``` +results/reads/ +├── ERR1109373.fastq.gz +├── ERR1109373_1.fastq.gz +├── ERR1109373_2.fastq.gz +├── SRR13191702_1.fastq.gz +└── SRR13191702_2.fastq.gz +``` + +## IRIDA Next integration file + +A JSON file for loading the data into IRIDA Next is output by this pipeline. The format of this JSON file is specified in our [Pipeline Standards for the IRIDA Next JSON](https://github.com/phac-nml/pipeline-standards#32-irida-next-json). 
This JSON file is written directly within the `--outdir` provided to the pipeline with the name `iridanext.output.json.gz` (ex: `[outdir]/iridanext.output.json.gz`). + +```json { - "files": { - "global": [ - { - "path": "summary/summary.txt.gz" - } - ], - "samples": { - "SAMPLE1": [ - { - "path": "assembly/SAMPLE1.assembly.fa.gz" - } - ], - "SAMPLE2": [ - { - "path": "assembly/SAMPLE2.assembly.fa.gz" - } - ], - "SAMPLE3": [ - { - "path": "assembly/SAMPLE3.assembly.fa.gz" - } - ] - } - }, - "metadata": { - "samples": { - "SAMPLE1": { - "reads.1": "sample1_R1.fastq.gz", - "reads.2": "sample1_R2.fastq.gz" - }, - "SAMPLE2": { - "reads.1": "sample2_R1.fastq.gz", - "reads.2": "sample2_R2.fastq.gz" - }, - "SAMPLE3": { - "reads.1": "sample1_R1.fastq.gz", - "reads.2": "null" - } - } + "files": { + "global": [], + "samples": { + "SampleB": [{ "path": "reads/SRR13191702_1.fastq.gz" }, { "path": "reads/SRR13191702_2.fastq.gz" }] } + } } ``` -Within the `files` section of this JSON file, all of the output paths are relative to the `outdir`. Therefore, `"path": "assembly/SAMPLE1.assembly.fa.gz"` refers to a file located within `outdir/assembly/SAMPLE1.assembly.fa.gz`. +Within the `files` section of this JSON file, all of the output paths are relative to the directory given by `--outdir` (here, `results`). Therefore, `"path": "reads/SRR13191702_1.fastq.gz"` refers to a file located at `results/reads/SRR13191702_1.fastq.gz`. -There is also a pipeline execution summary output file provided (specified in the above JSON as `"global": [{"path":"summary/summary.txt.gz"}]`). However, there is no formatting specification for this file. +An additional example of this file can be found at [tests/data/test1_iridanext.output.json](tests/data/test1_iridanext.output.json). 
-## Test profile +# Acknowledgements -To run with the test profile, please do: +This pipeline uses code and infrastructure developed and maintained by the [nf-core][nf-core] initiative, and reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). -```bash -nextflow run phac-nml/iridanextexample -profile docker,test -r main -latest --outdir results -``` +> The nf-core framework for community-curated bioinformatics pipelines. +> +> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. +> +> Nat Biotechnol. 2020 Feb 13. doi: 10.1038/s41587-020-0439-x. + +In addition, references for tools and data used in this pipeline are as follows: + +- The [fastq_download_prefetch_fasterqdump_sratools](https://nf-co.re/subworkflows/fastq_download_prefetch_fasterqdump_sratools) subworkflow from nf-core. Custom modifications to this workflow (and underlying modules) are found in the [subworkflows/local](subworkflows/local) and [modules/local](modules/local) directories. + +Other works this pipeline makes use of are found in the [CITATIONS.md](CITATIONS.md) file. # Legal -Copyright 2023 Government of Canada +Copyright 2024 Government of Canada Licensed under the MIT License (the "License"); you may not use this work except in compliance with the License. You may obtain a copy of the @@ -107,3 +112,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +[irida-next]: https://github.com/phac-nml/irida-next +[insdc]: https://www.insdc.org/ +[nf-core]: https://nf-co.re/ diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index cc1689a..798ed52 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,13 +1,13 @@ report_comment: > - This report has been generated by the phac-nml/iridanextexample + This report has been generated by the phac-nml/fetchdatairidanext analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: - "phac-nml-iridanextexample-methods-description": + "phac-nml-fetchdatairidanext-methods-description": order: -1000 software_versions: order: -1001 - "phac-nml-iridanextexample-summary": + "phac-nml-fetchdatairidanext-summary": order: -1002 export_plots: true diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 814a27d..fdf9dae 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,3 @@ -sample,fastq_1,fastq_2 -SAMPLE1,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R2.fastq.gz -SAMPLE2,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample2_R1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample2_R2.fastq.gz -SAMPLE3,https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/illumina/amplicon/sample1_R1.fastq.gz, +sample,insdc_accession +SAMPLE1,ERR1109373 +SAMPLE2,SRR13191702 diff --git a/assets/schema_input.json b/assets/schema_input.json index b1dd3fc..edcf572 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/phac-nml/iridanextexample/main/assets/schema_input.json", - 
"title": "phac-nml/iridanextexample pipeline - params.input schema", + "$id": "https://raw.githubusercontent.com/phac-nml/fetchdatairidanext/main/assets/schema_input.json", + "title": "phac-nml/fetchdatairidanext pipeline - params.input schema", "description": "Schema for the file provided with params.input", "type": "array", "items": { @@ -14,25 +14,13 @@ "unique": true, "errorMessage": "Sample name must be provided and cannot contain spaces" }, - "fastq_1": { + "insdc_accession": { "type": "string", - "pattern": "^\\S+\\.f(ast)?q(\\.gz)?$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have the extension: '.fq', '.fastq', '.fq.gz' or '.fastq.gz'" - }, - "fastq_2": { - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have the extension: '.fq', '.fastq', '.fq.gz' or '.fastq.gz'", - "anyOf": [ - { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q(\\.gz)?$" - }, - { - "type": "string", - "maxLength": 0 - } - ] + "pattern": "^(SRR|ERR|DRR)\\S+$", + "meta": ["insdc_accession"], + "errorMessage": "Must provide a valid accession" } }, - "required": ["sample", "fastq_1"] + "required": ["sample", "insdc_accession"] } } diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index 4a758fe..0000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. 
- - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. 
- row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. 
- - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. 
_viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - - """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
- with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/irida-next-output.py b/bin/irida-next-output.py deleted file mode 100755 index 32acd36..0000000 --- a/bin/irida-next-output.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python - -import json -from pathlib import Path -from mimetypes import guess_type -from functools import partial -import gzip -import sys -import argparse -import os -import glob - - -def get_open(f): - if "gzip" == guess_type(str(f))[1]: - return partial(gzip.open) - else: - return open - - -def main(argv=None): - parser = argparse.ArgumentParser( - description="Creates 
example output JSON for loading into IRIDA Next", - epilog="Example: python irida-next-output.py --json-output output.json *.json *.json.gz", - ) - parser.add_argument("files", nargs="+") - parser.add_argument( - "--summary-file", - action="store", - dest="summary_file", - type=str, - help="pipeline summary file", - default=None, - required=True, - ) - parser.add_argument( - "--json-output", - action="store", - dest="json_output", - type=str, - help="JSON output file", - default=None, - required=True, - ) - - args = parser.parse_args(argv) - - json_output_file = Path(args.json_output) - if json_output_file.exists(): - sys.stderr.write(f"Error: --json-output [{json_output_file}] exists") - return 1 - - # Not checking for the existance of the summary file - # because the path may be relative to the outdir, which we don't have here. - - input_files = args.files - if isinstance(input_files, str): - input_files = [input_files] - - output_dict = { - "files": { - "summary": {}, - "samples": {}, - }, - "metadata": { - "samples": {}, - }, - } - - output_metadata = { - "files": {"global": [{"path": str(args.summary_file)}], "samples": {}}, - "metadata": {"samples": {}}, - } - - for f in input_files: - _open = get_open(f) - with _open(f, "r") as fh: - sample_metadata = json.load(fh) - output_metadata["files"]["samples"] |= sample_metadata["files"]["samples"] - output_metadata["metadata"]["samples"] |= sample_metadata["metadata"]["samples"] - - data_json = json.dumps(output_metadata, sort_keys=True, indent=4) - _open = get_open(json_output_file) - with _open(json_output_file, "wt") as oh: - oh.write(data_json) - - print(f"Output written to [{json_output_file}]") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/simplify_irida_json.py b/bin/simplify_irida_json.py deleted file mode 100755 index c486625..0000000 --- a/bin/simplify_irida_json.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python - -import json -import argparse -import sys -import 
gzip -from mimetypes import guess_type -from functools import partial -from pathlib import Path - - -def flatten_dictionary(dictionary): - result = {} - - def flatten(item, name=""): - if type(item) is dict: - for component in item: - flatten(item[component], str(name) + str(component) + ".") - - elif type(item) is list: - for i in range(len(item)): - flatten(item[i], str(name) + str(i + 1) + ".") # i + 1 because biologists - - else: - result[str(name)[:-1]] = item # [:-1] avoids the "." appended on the previous recursion - - flatten(dictionary) - return result - - -def main(): - parser = argparse.ArgumentParser( - description="Simplifies JSON files for use with IRIDA Next", - epilog="Example: python simplify_irida_json.py --json-output output.json input.json", - ) - parser.add_argument("input") - parser.add_argument( - "--json-output", - action="store", - dest="json_output", - type=str, - help="JSON output file", - default=None, - required=True, - ) - - args = parser.parse_args() - - json_output_location = Path(args.json_output) - if json_output_location.exists(): - sys.stderr.write("Error: --json-output [{json_output_location}] exists!\n") - return 1 - - json_input_file = args.input - - # Handle GZIP and non-GZIP - encoding = guess_type(json_input_file)[1] - open_file = partial(gzip.open, mode="rt") if encoding == "gzip" else open # partial (function pointer) - - with open_file(json_input_file) as input_file: - input_json = json.load(input_file) - - # Flatten metadata: - for sample in input_json["metadata"]["samples"]: - input_json["metadata"]["samples"][sample] = flatten_dictionary(input_json["metadata"]["samples"][sample]) - - json_data = json.dumps(input_json, sort_keys=True, indent=4) - with open(json_output_location, "w") as output_file: - output_file.write(json_data) - - print("Output written to " + str(json_output_location) + "!") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/conf/iridanext.config b/conf/iridanext.config new 
file mode 100644 index 0000000..216c6cb --- /dev/null +++ b/conf/iridanext.config @@ -0,0 +1,12 @@ +iridanext { + enabled = true + output { + path = "${params.outdir}/iridanext.output.json.gz" + overwrite = true + validate = true + files { + idkey = "id" + samples = ["**/reads/*.fastq.gz"] + } + } +} diff --git a/conf/modules.config b/conf/modules.config index 1fbf485..dd1fdfe 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,53 +12,29 @@ process { - // Publish directory names - assembly_directory_name = "assembly" - summary_directory_name = "summary" - publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: SAMPLESHEET_CHECK { + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + pattern: '*_versions.yml' ] } - withName: ASSEMBLY_STUB { - publishDir = [ - path: { ["${params.outdir}", "${task.assembly_directory_name}"].join(File.separator) }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + withName: SRATOOLS_PREFETCH { + maxForks = params.max_jobs_with_network_connections } - withName: GENERATE_SUMMARY { - publishDir = [ - path: { ["${params.outdir}", "${task.summary_directory_name}"].join(File.separator) }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: IRIDA_NEXT_OUTPUT { + withName: SRATOOLS_FASTERQDUMP { publishDir = [ path: { "${params.outdir}" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - - withName: CUSTOM_DUMPSOFTWAREVERSIONS { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - pattern: '*_versions.yml' + pattern: 'reads/*.fastq.gz' ] } } diff --git a/conf/test.config b/conf/test.config index 52f31db..ee4ed3d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,5 +20,5 @@ params { max_time = '1.h' // Input data - input = 'https://raw.githubusercontent.com/phac-nml/iridanextexample/main/assets/samplesheet.csv' + input = 'https://raw.githubusercontent.com/phac-nml/fetchdatairidanext/dev/assets/samplesheet.csv' } diff --git a/conf/test_full.config b/conf/test_full.config index 319becf..cac35dd 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,5 +15,5 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/phac-nml/iridanextexample/main/assets/samplesheet.csv' + input = 'https://raw.githubusercontent.com/phac-nml/fetchdatairidanext/dev/assets/samplesheet.csv' } diff --git a/docs/README.md b/docs/README.md index db77372..67788e8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ -# phac-nml/iridanextexample: Documentation +# phac-nml/fetchdatairidanext: Documentation -The phac-nml/iridanextexample documentation is split into the following pages: +The phac-nml/fetchdatairidanext documentation is split into the following pages: - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. 
diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png deleted file mode 100755 index 361d0e4..0000000 Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png deleted file mode 100755 index cb39ebb..0000000 Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png deleted file mode 100755 index a4b89bf..0000000 Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ diff --git a/docs/output.md b/docs/output.md index 69a1069..7e95254 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,4 +1,4 @@ -# phac-nml/iridanextexample: Output +# phac-nml/fetchdatairidanext: Output ## Introduction @@ -6,11 +6,10 @@ This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -- assembly: very small mock assembly files for each sample -- generate: intermediate files used in generating the IRIDA Next JSON output -- pipeline_info: information about the pipeline's execution -- simplify: simplified intermediate files used in generating the IRIDA Next JSON output -- summary: summary report about the pipeline's execution and results +- `sratools`: Data from the SRA tools step (downloading sequence reads). +- `reads`: The fastq files of downloaded reads. +- `pipeline_info`: information about the pipeline's execution +- `custom`: information on detected/generated NCBI settings used for accessing certain databases (see the [SRA Tools documentation](https://github.com/ncbi/sra-tools/wiki)). The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.gz` and will be written to the top-level of the results directory. 
This file is compressed using GZIP and conforms to the [IRIDA Next JSON output specifications](https://github.com/phac-nml/pipeline-standards#42-irida-next-json). @@ -18,50 +17,31 @@ The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.g The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [Assembly stub](#assembly-stub) - Performs a stub assembly by generating a mock assembly -- [Generate sample JSON](#generate-sample-json) - Generates a JSON file for each sample -- [Generate summary](#generate-summary) - Generates a summary text file describing the samples and assemblies -- [Simplify IRIDA JSON](#simplify-irida-json) - Simplifies the sample JSONs by limiting nesting depth -- [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next +- [Reads download](#reads-download) - Downloads data from INSDC databases (using NCBI's SRA Tools). - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next -### Assembly stub - -
-Output files - -- `assembly/` - - Mock assembly files: `ID.assembly.fa.gz` - -
- -### Generate sample JSON - -
-Output files - -- `generate/` - - JSON files: `ID.json.gz` - -
- -### Generate summary +### Reads download
Output files -- `summary/` - - Text summary describing samples and assemblies: `summary.txt.gz` +- `sratools/` + - Sequence data in SRA format: `INSDC_ACCESSION/INSDC_ACCESSION.sra` +- `reads/` + - Reads in fastq format: `INSDC_ACCESSION.fastq.gz`
-### Simplify IRIDA JSON +### Pipeline information
Output files -- `simplify/` - - Simplified JSON files: `ID.simple.json.gz` +- `pipeline_info/` + - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline. + - Parameters used by the pipeline run: `params.json`. 
@@ -75,17 +55,4 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d -### Pipeline information - -
-Output files - -- `pipeline_info/` - - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - - Parameters used by the pipeline run: `params.json`. - -
- [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. diff --git a/docs/run-wes-example.json b/docs/run-wes-example.json index 7807de1..519cf60 100644 --- a/docs/run-wes-example.json +++ b/docs/run-wes-example.json @@ -1,18 +1,19 @@ { "workflow_params": { - "--input": "[SAMPLESHEET]", - "-r": "main" + "--input": "az://samplesheet.csv", + "-r": "1.0.0" }, - "workflow_type": "DSL2", - "workflow_type_version": "22.10.7", + "workflow_type": "NFL", + "workflow_type_version": "DSL2", + "workflow_engine": "nextflow", + "workflow_engine_version": "23.10.0", "tags": { "createdBy": "TestUser", "group": "TestUserGroup" }, "workflow_engine_parameters": { - "engine": "nextflow", "execute_loc": "azure" }, - "workflow_url": "https://github.com/phac-nml/iridanextexample", + "workflow_url": "https://github.com/phac-nml/fetchdatairidanext", "workflow_attachment": "" } diff --git a/docs/usage.md b/docs/usage.md index 5563e59..a261f28 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,12 +1,12 @@ -# phac-nml/iridanextexample: Usage +# phac-nml/fetchdatairidanext: Usage ## Introduction -This pipeline is an example that illustrates running a nf-core-compliant pipeline on IRIDA Next. +This pipeline is used to download read data from INSDC databases. ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. 
Use this parameter to specify its location. It has to be a comma-separated file with two columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' @@ -14,22 +14,20 @@ You will need to create a samplesheet with information about the samples you wou ### Full samplesheet -The input samplesheet must contain three columns: `ID`, `fastq_1`, `fastq_2`. The IDs within a samplesheet should be unique. All other columns will be ignored. +The input samplesheet must contain two columns: `sample`, `insdc_accession`. The sample entries within a samplesheet should be unique. All other columns will be ignored. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. +An example samplesheet is shown below: ```console -sample,fastq_1,fastq_2 -SAMPLE1,sample1_R1.fastq.gz,sample1_R2.fastq.gz -SAMPLE2,sample2_R1.fastq.gz,sample2_R2.fastq.gz -SAMPLE3,sample1_R1.fastq.gz, +sample,insdc_accession +SAMPLE1,ERR1109373 +SAMPLE2,SRR13191702 ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. Samples should be unique within a samplesheet. | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------ | +| `sample` | A sample name which will be associated with downloaded reads. Samples should be unique within a samplesheet. | +| `insdc_accession` | The accession (run accession) from one of the INSDC databases (NCBI, ENA, or DDBJ). 
| An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -38,10 +36,10 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p The typical command for running the pipeline is as follows: ```bash -nextflow run main.nf --input ./samplesheet.csv --outdir ./results -profile singularity +nextflow run phac-nml/fetchdatairidanext -profile test,docker --outdir results ``` -This will launch the pipeline with the `singularity` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `docker` configuration profile (use `singularity` for singularity profile). See below for more information about profiles. Note that the pipeline will create the following files in your working directory: @@ -62,7 +60,7 @@ Do not use `-c ` to specify parameters as this will result in errors. Cust The above pipeline run specified with a params file in yaml format: ```bash -nextflow run phac-nml/iridanextexample -profile docker -params-file params.yaml +nextflow run phac-nml/fetchdatairidanext -profile docker -params-file params.yaml ``` with `params.yaml` containing: @@ -79,7 +77,7 @@ You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-c It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [phac-nml/iridanextexample page](https://github.com/phac-nml/iridanextexample) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. 
+First, go to the [phac-nml/fetchdatairidanext page](https://github.com/phac-nml/fetchdatairidanext) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. diff --git a/lib/WorkflowIridanextexample.groovy b/lib/WorkflowFetchdatairidanext.groovy similarity index 92% rename from lib/WorkflowIridanextexample.groovy rename to lib/WorkflowFetchdatairidanext.groovy index 482515f..9ac5b58 100755 --- a/lib/WorkflowIridanextexample.groovy +++ b/lib/WorkflowFetchdatairidanext.groovy @@ -1,11 +1,11 @@ // -// This file holds several functions specific to the workflow/iridanextexample.nf in the phac-nml/iridanextexample pipeline +// This file holds several functions specific to the workflow/fetchdatairidanext.nf in the phac-nml/fetchdatairidanext pipeline // import nextflow.Nextflow import groovy.text.SimpleTemplateEngine -class WorkflowIridanextExample { +class WorkflowFetchdatairidanext { // // Check and validate parameters diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index d00351f..6ae8059 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -1,5 +1,5 @@ // -// This file holds several functions specific to the main.nf workflow in the phac-nml/iridanextexample pipeline +// This file holds several functions specific to the main.nf workflow in the phac-nml/fetchdatairidanext pipeline // import nextflow.Nextflow diff --git a/main.nf b/main.nf index d687e9d..dfe6737 100644 --- a/main.nf +++ b/main.nf @@ -1,9 +1,9 @@ #!/usr/bin/env nextflow /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - phac-nml/iridanextexample + phac-nml/fetchdatairidanext 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Github : https://github.com/phac-nml/iridanextexample + Github : https://github.com/phac-nml/fetchdatairidanext ---------------------------------------------------------------------------------------- */ @@ -21,7 +21,7 @@ include { validateParameters; paramsHelp; paramsSummaryLog; fromSamplesheet } fr if (params.help) { def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' - def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv -profile docker" log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) System.exit(0) } @@ -39,13 +39,13 @@ WorkflowMain.initialise(workflow, params, log) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { IRIDANEXT } from './workflows/iridanextexample' +include { FETCHDATAIRIDANEXT } from './workflows/fetchdatairidanext' // // WORKFLOW: Run main phac-nml/iridanextexample analysis pipeline // -workflow PHACNML_IRIDANEXT { - IRIDANEXT () +workflow PHACNML_FETCHDATAIRIDANEXT { + FETCHDATAIRIDANEXT () } /* @@ -59,7 +59,7 @@ workflow PHACNML_IRIDANEXT { // See: https://github.com/nf-core/rnaseq/issues/619 // workflow { - PHACNML_IRIDANEXT () + PHACNML_FETCHDATAIRIDANEXT () } /* diff --git a/modules.json b/modules.json index 3a890bb..6250d22 100644 --- a/modules.json +++ b/modules.json @@ -9,6 +9,16 @@ "branch": "master", "git_sha": "1526dc37227a1101bdca25339337362e187a6b3b", "installed_by": ["modules"] + }, + "custom/sratoolsncbisettings": { + "branch": "master", + "git_sha": "e719354ba77df0a1bd310836aa2039b45c29d620", + "installed_by": ["modules"] + }, + "sratools/prefetch": { + "branch": "master", + "git_sha": 
"e719354ba77df0a1bd310836aa2039b45c29d620", + "installed_by": ["modules"] } } } diff --git a/modules/local/assemblystub/main.nf b/modules/local/assemblystub/main.nf deleted file mode 100644 index 00f27d2..0000000 --- a/modules/local/assemblystub/main.nf +++ /dev/null @@ -1,33 +0,0 @@ -process ASSEMBLY_STUB { - tag "$meta.id" - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.assembly.fa.gz"), emit: assembly - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - cat <<-EOF > ${prefix}.assembly.fa - >${meta.id}-stub-assembly - ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTTAAAAACCCCCGGGGGTTTTT - EOF - - gzip -n ${prefix}.assembly.fa - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - assemblystub : 0.1.0 - END_VERSIONS - """ -} diff --git a/modules/local/generatesamplejson/main.nf b/modules/local/generatesamplejson/main.nf deleted file mode 100644 index f3b5cd3..0000000 --- a/modules/local/generatesamplejson/main.nf +++ /dev/null @@ -1,49 +0,0 @@ -process GENERATE_SAMPLE_JSON { - tag "$meta.id" - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - tuple val(meta), path(reads), path(assembly) - - output: - tuple val(meta), path("*.json.gz"), emit: json - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def assembly_path = ["${task.assembly_directory_name}", "${assembly}"].join(File.separator) - """ - cat <<-EOF > "${meta.id}.json" - { - "files": { - "samples": { - "${meta.id}": [ - { - "path": "${assembly_path}" - } - ] - } - }, - "metadata": { - "samples": { - "${meta.id}": { - "reads": ["${reads[0]}", "${reads[1]}"] - } - } - } - } - EOF - gzip ${meta.id}.json - - cat <<-END_VERSIONS 
> versions.yml - "${task.process}": - generatesamplejson : 0.1.0 - END_VERSIONS - """ -} diff --git a/modules/local/generatesummary/main.nf b/modules/local/generatesummary/main.nf deleted file mode 100644 index a3d0245..0000000 --- a/modules/local/generatesummary/main.nf +++ /dev/null @@ -1,38 +0,0 @@ -process GENERATE_SUMMARY { - label 'process_single' - container 'docker.io/python:3.9.17' - - input: - val summaries - - output: - path("summary.txt.gz"), emit: summary - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def sorted_summaries = summaries.sort{ it[0].id } - - // Generate summary text: - def summary_text = "IRIDANEXTEXAMPLE Pipeline Summary\n\nSUCCESS!\n" - - // TODO: Consider the possibility of code injection. - // Should probably be moved to file processing through Python. - for (summary in sorted_summaries) { - summary_text += "\n${summary[0].id}:\n" - summary_text += " reads.1: ${summary[1][0]}\n" - summary_text += " reads.2: ${summary[1][1]}\n" - summary_text += " assembly: ${summary[2]}\n" - } - - version_text = "\"${task.process}\":\n generatesummary : 0.1.0" - - """ - echo "${summary_text}" > summary.txt - gzip -n summary.txt - echo "${version_text}" > versions.yml - """ -} diff --git a/modules/local/iridanextoutput/main.nf b/modules/local/iridanextoutput/main.nf deleted file mode 100644 index 92595ee..0000000 --- a/modules/local/iridanextoutput/main.nf +++ /dev/null @@ -1,31 +0,0 @@ -process IRIDA_NEXT_OUTPUT { - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - path(samples_data) - - output: - path("iridanext.output.json.gz"), emit: output_json - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def samples_data_dir = "samples_data" - """ - irida-next-output.py \\ - $args \\ - --summary-file ${task.summary_directory_name}/summary.txt.gz \\ - 
--json-output iridanext.output.json.gz \\ - ${samples_data} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - iridanextoutput : 0.1.0 - END_VERSIONS - """ -} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index 5fb6717..0000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,31 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in phac-nml/iridanextexample/bin/ - """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/simplifyiridajson/main.nf b/modules/local/simplifyiridajson/main.nf deleted file mode 100644 index e2e7352..0000000 --- a/modules/local/simplifyiridajson/main.nf +++ /dev/null @@ -1,33 +0,0 @@ -process SIMPLIFY_IRIDA_JSON { - tag "$meta.id" - label 'process_single' - - container 'docker.io/python:3.9.17' - - input: - tuple val(meta), path(json) - - output: - tuple val(meta), path("*.simple.json.gz") , emit: simple_json - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - simplify_irida_json.py \\ - $args \\ - --json-output ${meta.id}.simple.json \\ - ${json} - - gzip ${meta.id}.simple.json - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - simplifyiridajson : 0.1.0 
- END_VERSIONS - """ -} diff --git a/modules/local/sratools/fasterqdump/environment.yml b/modules/local/sratools/fasterqdump/environment.yml new file mode 100644 index 0000000..4011b69 --- /dev/null +++ b/modules/local/sratools/fasterqdump/environment.yml @@ -0,0 +1,8 @@ +name: sratools_fasterqdump +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sra-tools=3.0.8 + - conda-forge::pigz=2.6 diff --git a/modules/local/sratools/fasterqdump/main.nf b/modules/local/sratools/fasterqdump/main.nf new file mode 100644 index 0000000..66ba956 --- /dev/null +++ b/modules/local/sratools/fasterqdump/main.nf @@ -0,0 +1,61 @@ +process SRATOOLS_FASTERQDUMP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' : + 'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' }" + + input: + tuple val(meta), path(sra) + path ncbi_settings + path certificate + + output: + tuple val(meta), path('reads/*.fastq.gz'), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def key_file = '' + + if (certificate.toString().endsWith('.jwt')){ + key_file += " --perm ${certificate}" + } + else if (certificate.toString().endsWith('.ngc')){ + key_file += " --ngc ${certificate}" + } + + """ + export NCBI_SETTINGS="\$PWD/${ncbi_settings}" + + # Make directory ahead of time since otherwise + # fasterq-dump does not set correct permissions/owner + mkdir -p reads + + fasterq-dump \\ + $args \\ + --threads $task.cpus \\ + --outdir reads \\ + ${key_file} \\ + ${sra} + + pigz \\ 
+ $args2 \\ + --no-name \\ + --processes $task.cpus \\ + reads/*.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sratools: \$(fasterq-dump --version 2>&1 | grep -Eo '[0-9.]+') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/local/sratools/fasterqdump/meta.yml b/modules/local/sratools/fasterqdump/meta.yml new file mode 100644 index 0000000..b5e0175 --- /dev/null +++ b/modules/local/sratools/fasterqdump/meta.yml @@ -0,0 +1,53 @@ +name: sratools_fasterqdump +description: Extract sequencing reads in FASTQ format from a given NCBI Sequence Read Archive (SRA). +keywords: + - sequencing + - FASTQ + - dump +tools: + - sratools: + description: SRA Toolkit and SDK from NCBI + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ["Public Domain"] +input: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - sra: + type: directory + description: Directory containing ETL data for the given SRA. + pattern: "*/*.sra" + - ncbi_settings: + type: file + description: > + An NCBI user settings file. + + pattern: "*.mkfg" + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + + pattern: "*.cart" +output: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Extracted FASTQ file or files if the sequencing reads are paired-end. 
+ pattern: "*.fastq.gz" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/custom/sratoolsncbisettings/environment.yml b/modules/nf-core/custom/sratoolsncbisettings/environment.yml new file mode 100644 index 0000000..44a1b00 --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/environment.yml @@ -0,0 +1,7 @@ +name: custom_sratoolsncbisettings +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sra-tools=3.0.8 diff --git a/modules/nf-core/custom/sratoolsncbisettings/main.nf b/modules/nf-core/custom/sratoolsncbisettings/main.nf new file mode 100644 index 0000000..ba9441c --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/main.nf @@ -0,0 +1,20 @@ +process CUSTOM_SRATOOLSNCBISETTINGS { + tag 'ncbi-settings' + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sra-tools:3.0.8--h9f5acd7_0' : + 'biocontainers/sra-tools:3.0.8--h9f5acd7_0' }" + + output: + path('*.mkfg') , emit: ncbi_settings + path 'versions.yml', emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" + template 'detect_ncbi_settings.sh' +} diff --git a/modules/nf-core/custom/sratoolsncbisettings/meta.yml b/modules/nf-core/custom/sratoolsncbisettings/meta.yml new file mode 100644 index 0000000..46a6cd3 --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/meta.yml @@ -0,0 +1,28 @@ +name: "custom_sratoolsncbisettings" +description: Test for the presence of suitable NCBI settings or create them on the fly. 
+keywords: + - NCBI + - settings + - sra-tools + - prefetch + - fasterq-dump +tools: + - "sratools": + description: "SRA Toolkit and SDK from NCBI" + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ["Public Domain"] +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - ncbi_settings: + type: file + description: An NCBI user settings file. + pattern: "*.mkfg" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh b/modules/nf-core/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh new file mode 100644 index 0000000..cfe3a32 --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -u + + +# Get the expected NCBI settings path and define the environment variable +# `NCBI_SETTINGS`. +eval "$(vdb-config -o n NCBI_SETTINGS | sed 's/[" ]//g')" + +# If the user settings do not exist yet, create a file suitable for `prefetch` +# and `fasterq-dump`. If an existing settings file does not contain the required +# values, error out with a helpful message. +if [[ ! -f "${NCBI_SETTINGS}" ]]; then + printf '!{config}' > 'user-settings.mkfg' +else + prefetch --help &> /dev/null + if [[ $? = 78 ]]; then + echo "You have an existing vdb-config at '${NCBI_SETTINGS}' but it is"\ + "missing the required entries for /LIBS/GUID and"\ + "/libs/cloud/report_instance_identity."\ + "Feel free to add the following to your settings file:" >&2 + echo "$(printf '!{config}')" >&2 + exit 1 + fi + fasterq-dump --help &> /dev/null + if [[ $? 
= 78 ]]; then + echo "You have an existing vdb-config at '${NCBI_SETTINGS}' but it is"\ + "missing the required entries for /LIBS/GUID and"\ + "/libs/cloud/report_instance_identity."\ + "Feel free to add the following to your settings file:" >&2 + echo "$(printf '!{config}')" >&2 + exit 1 + fi + if [[ "${NCBI_SETTINGS}" != *.mkfg ]]; then + echo "The detected settings '${NCBI_SETTINGS}' do not have the required"\ + "file extension '.mkfg'." >&2 + exit 1 + fi + cp "${NCBI_SETTINGS}" ./ +fi + +cat <<-END_VERSIONS > versions.yml +"!{task.process}": + sratools: $(vdb-config --version 2>&1 | grep -Eo '[0-9.]+') +END_VERSIONS diff --git a/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test b/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test new file mode 100644 index 0000000..9f17867 --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test @@ -0,0 +1,42 @@ +nextflow_process { + + name "Test Process CUSTOM_SRATOOLSNCBISETTINGS" + script "../main.nf" + process "CUSTOM_SRATOOLSNCBISETTINGS" + config "modules/nf-core/custom/sratoolsncbisettings/tests/nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/sratoolsncbisettings" + + test("Should run without failures") { + + when { + params { + settings_path = '/tmp/.ncbi' + settings_file = "${params.settings_path}/user-settings.mkfg" + outdir = "$outputDir" + } + + process { + """ + file(params.settings_path).mkdirs() + def settings = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + settings.copyTo(params.settings_file) + """ + } + } + + then { + assert process.success + assert snapshot( + process.out.versions + ).match() + + with(process.out.ncbi_settings) { + assert path(get(0)).readLines().any { it.contains('/LIBS/GUID') } + assert path(get(0)).readLines().any { it.contains('/libs/cloud/report_instance_identity') } + } + } + } +} diff --git 
a/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test.snap b/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test.snap new file mode 100644 index 0000000..65a706d --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "Should run without failures": { + "content": [ + [ + "versions.yml:md5,3d6ee88cce1ee517e198633f062589a8" + ] + ], + "timestamp": "2024-01-09T22:43:44.996003" + } +} diff --git a/modules/nf-core/custom/sratoolsncbisettings/tests/nextflow.config b/modules/nf-core/custom/sratoolsncbisettings/tests/nextflow.config new file mode 100644 index 0000000..c4a96e9 --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/tests/nextflow.config @@ -0,0 +1,16 @@ +params.settings_path = '/tmp/.ncbi' +params.settings_file = "${params.settings_path}/user-settings.mkfg" + +env.NCBI_SETTINGS = params.settings_file + +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + withName: CUSTOM_SRATOOLSNCBISETTINGS { + containerOptions = { + (workflow.containerEngine == 'singularity') ? 
+ "-B ${params.settings_path}:${params.settings_path}" : + "-v ${params.settings_path}:${params.settings_path}" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/sratoolsncbisettings/tests/tags.yml b/modules/nf-core/custom/sratoolsncbisettings/tests/tags.yml new file mode 100644 index 0000000..fb4a08a --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/tests/tags.yml @@ -0,0 +1,2 @@ +custom/sratoolsncbisettings: + - modules/nf-core/custom/sratoolsncbisettings/** diff --git a/modules/nf-core/sratools/prefetch/environment.yml b/modules/nf-core/sratools/prefetch/environment.yml new file mode 100644 index 0000000..cfc7d9a --- /dev/null +++ b/modules/nf-core/sratools/prefetch/environment.yml @@ -0,0 +1,7 @@ +name: sratools_prefetch +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sra-tools=3.0.8 diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf new file mode 100644 index 0000000..3c30739 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -0,0 +1,35 @@ +process SRATOOLS_PREFETCH { + tag "$id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/sra-tools:3.0.8--h9f5acd7_0' : + 'biocontainers/sra-tools:3.0.8--h9f5acd7_0' }" + + input: + tuple val(meta), val(id) + path ncbi_settings + path certificate + + output: + tuple val(meta), path(id), emit: sra + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + args = task.ext.args ?: '' + args2 = task.ext.args2 ?: '5 1 100' // + if (certificate) { + if (certificate.toString().endsWith('.jwt')) { + args += " --perm ${certificate}" + } + else if (certificate.toString().endsWith('.ngc')) { + args += " --ngc ${certificate}" + } + } + + template 'retry_with_backoff.sh' +} diff --git a/modules/nf-core/sratools/prefetch/meta.yml b/modules/nf-core/sratools/prefetch/meta.yml new file mode 100644 index 0000000..ff54229 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/meta.yml @@ -0,0 +1,56 @@ +name: sratools_prefetch +description: Download sequencing data from the NCBI Sequence Read Archive (SRA). +keywords: + - sequencing + - fastq + - prefetch +tools: + - sratools: + description: SRA Toolkit and SDK from NCBI + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ["Public Domain"] +input: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - id: + type: string + description: > + A string denoting an SRA id. + + - ncbi_settings: + type: file + description: > + An NCBI user settings file. + + pattern: "*.mkfg" + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + + pattern: "*.cart" +output: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - sra: + type: directory + description: > + Directory containing the ETL data for the given SRA id. 
+ + pattern: "*/*.sra" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh new file mode 100755 index 0000000..a72a4bf --- /dev/null +++ b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +set -u + +retry_with_backoff() { + local max_attempts=${1} + local delay=${2} + local max_time=${3} + local attempt=1 + local output= + local status= + + # Remove the first three arguments to this function in order to access + # the 'real' command with `${@}`. + shift 3 + + while [ ${attempt} -le ${max_attempts} ]; do + output=$("${@}") + status=${?} + + if [ ${status} -eq 0 ]; then + break + fi + + if [ ${attempt} -lt ${max_attempts} ]; then + echo "Failed attempt ${attempt} of ${max_attempts}. Retrying in ${delay} s." >&2 + sleep ${delay} + elif [ ${attempt} -eq ${max_attempts} ]; then + echo "Failed after ${attempt} attempts." 
>&2 + return ${status} + fi + + attempt=$(( ${attempt} + 1 )) + delay=$(( ${delay} * 2 )) + if [ ${delay} -ge ${max_time} ]; then + delay=${max_time} + fi + done + + echo "${output}" +} + +export NCBI_SETTINGS="$PWD/!{ncbi_settings}" + +retry_with_backoff !{args2} \ + prefetch \ + !{args} \ + !{id} + +[ -f !{id}.sralite ] && vdb-validate !{id}.sralite || vdb-validate !{id} + +cat <<-END_VERSIONS > versions.yml +"!{task.process}": + sratools: $(prefetch --version 2>&1 | grep -Eo '[0-9.]+') +END_VERSIONS diff --git a/modules/nf-core/sratools/prefetch/tests/main.nf.test b/modules/nf-core/sratools/prefetch/tests/main.nf.test new file mode 100644 index 0000000..ed710ba --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/main.nf.test @@ -0,0 +1,55 @@ +nextflow_process { + name "Test Process SRATOOLS_PREFETCH" + script "../main.nf" + process "SRATOOLS_PREFETCH" + tag "modules" + tag "modules_nfcore" + tag "sratools" + tag "sratools/prefetch" + + test("sratools/prefetch") { + + when { + params { + outdir = "output" + } + process { + """ + input[0] = Channel.of([ [ id:'test', single_end:false ], 'DRR000774' ]) + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sratools/prefetch with sralite") { + + when { + params { + outdir = "output" + } + process { + """ + input[0] = Channel.of([ [ id:'test', single_end:false ], 'SRR1170046' ]) + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap b/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap new file mode 100644 index 
0000000..ab1d208 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sratools/prefetch with sralite": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "SRR1170046.sralite:md5,7acfce556ca0951aff49d780899c105b" + ] + ] + ], + "1": [ + "versions.yml:md5,c967dea4135cb75490e1e801c4639efc" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "SRR1170046.sralite:md5,7acfce556ca0951aff49d780899c105b" + ] + ] + ], + "versions": [ + "versions.yml:md5,c967dea4135cb75490e1e801c4639efc" + ] + } + ], + "timestamp": "2023-10-13T12:11:24.563510389" + }, + "sratools/prefetch": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "DRR000774.sra:md5,7647dba20c89c0e3d7ad13842f060eb0" + ] + ] + ], + "1": [ + "versions.yml:md5,c967dea4135cb75490e1e801c4639efc" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "DRR000774.sra:md5,7647dba20c89c0e3d7ad13842f060eb0" + ] + ] + ], + "versions": [ + "versions.yml:md5,c967dea4135cb75490e1e801c4639efc" + ] + } + ], + "timestamp": "2023-10-13T12:11:02.75256571" + } +} \ No newline at end of file diff --git a/modules/nf-core/sratools/prefetch/tests/tags.yml b/modules/nf-core/sratools/prefetch/tests/tags.yml new file mode 100644 index 0000000..52110bf --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/tags.yml @@ -0,0 +1,2 @@ +sratools/prefetch: + - modules/nf-core/sratools/prefetch/** diff --git a/nextflow.config b/nextflow.config index 2970dcd..0ad21a7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,9 +11,6 @@ params { // Input options input = null - project_name = 'assembly' - assembler = 'stub' - random_seed = 1 // Boilerplate options outdir = null @@ -40,15 +37,19 @@ params { max_time = '1.h' // Schema validation default options - validationFailUnrecognisedParams = false - validationLenientMode = false - validationSchemaIgnoreParams = 'genomes,igenomes_base' - 
validationShowHiddenParams = false - validate_params = true + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = true + + // Options for limiting network activity + max_jobs_with_network_connections = 1 } // Load base.config by default for all pipelines includeConfig 'conf/base.config' +includeConfig 'conf/iridanext.config' profiles { debug { @@ -155,6 +156,7 @@ singularity.registry = 'quay.io' // Nextflow plugins plugins { id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-iridanext' } // Export these variables to prevent local Python/R libraries from conflicting with those in the container @@ -190,13 +192,13 @@ dag { } manifest { - name = 'phac-nml/iridanextexample' + name = 'phac-nml/fetchdatairidanext' author = """Aaron Petkau and Eric Marinier""" - homePage = 'https://github.com/phac-nml/iridanextexample' - description = """IRIDA Next Example Pipeline""" + homePage = 'https://github.com/phac-nml/fetchdatairidanext' + description = """IRIDA Next pipeline for fetching data from NCBI""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.0.2' + version = '1.0.0' doi = '' defaultBranch = 'main' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 28d0d69..e7709da 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,8 +1,8 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/phac-nml/iridanextexample/main/nextflow_schema.json", - "title": "phac-nml/iridanextexample pipeline parameters", - "description": "IRIDA Next Example Pipeline", + "$id": "https://raw.githubusercontent.com/phac-nml/fetchdatairidanext/main/nextflow_schema.json", + "title": "phac-nml/fetchdatairidanext pipeline parameters", + "description": "IRIDA Next NCBI Download pipeline", "type": "object", 
"definitions": { "input_output_options": { @@ -27,35 +27,16 @@ "type": "string", "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" - }, - "project_name": { - "type": "string", - "default": "assembly", - "pattern": "^\\S+$", - "description": "The name of the project.", - "fa_icon": "fas fa-tag" - }, - "assembler": { - "type": "string", - "default": "stub", - "fa_icon": "fas fa-desktop", - "description": "The sequence assembler to use for sequence assembly.", - "enum": ["default", "stub", "experimental"] - }, - "random_seed": { - "type": "integer", - "default": 1, - "fa_icon": "fas fa-dice-six", - "description": "The random seed to use for sequence assembly.", - "minimum": 1 + "fa_icon": "fas fa-folder-open", + "hidden": true }, "email": { "type": "string", "description": "Email address for completion summary.", "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "hidden": true } } }, @@ -102,7 +83,7 @@ "max_cpus": { "type": "integer", "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, + "default": 4, "fa_icon": "fas fa-microchip", "hidden": true, "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. 
`--max_cpus 1`" @@ -110,7 +91,7 @@ "max_memory": { "type": "string", "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", + "default": "2.GB", "fa_icon": "fas fa-memory", "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", "hidden": true, @@ -119,7 +100,7 @@ "max_time": { "type": "string", "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", + "default": "1.h", "fa_icon": "far fa-clock", "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, @@ -209,6 +190,13 @@ "description": "Validation of parameters in lenient more.", "hidden": true, "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." + }, + "max_jobs_with_network_connections": { + "type": "integer", + "default": 1, + "minimum": 1, + "description": "Maximum number of jobs with network connections allowed to run at once", + "hidden": true } } } diff --git a/nf-test.config b/nf-test.config index 870799d..2fa82ad 100644 --- a/nf-test.config +++ b/nf-test.config @@ -3,6 +3,6 @@ config { testsDir "tests" workDir ".nf-test" configFile "tests/nextflow.config" - profile "" + profile "docker" } diff --git a/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/main.nf new file mode 100644 index 0000000..68b718e --- /dev/null +++ b/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -0,0 +1,39 @@ +include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main' +include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/sratools/prefetch/main' +include { SRATOOLS_FASTERQDUMP } from '../../../modules/local/sratools/fasterqdump/main' + +// +// Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). 
+// +workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { + take: + ch_sra_ids // channel: [ val(meta), val(id) ] + ch_dbgap_key // channel: [ path(dbgap_key) ] + + main: + + ch_versions = Channel.empty() + + // + // Detect existing NCBI user settings or create new ones. + // + CUSTOM_SRATOOLSNCBISETTINGS() + ch_ncbi_settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings + ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions) + + // + // Prefetch sequencing reads in SRA format. + // + SRATOOLS_PREFETCH ( ch_sra_ids, ch_ncbi_settings, ch_dbgap_key ) + ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) + + // + // Convert the SRA format into one or more compressed FASTQ files. + // + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, ch_ncbi_settings, ch_dbgap_key ) + ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) + + emit: + reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/meta.yml b/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/meta.yml new file mode 100644 index 0000000..1b968ac --- /dev/null +++ b/subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/meta.yml @@ -0,0 +1,52 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: fastq_download_prefetch_fasterqdump_sratools +description: Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). +keywords: + - SRA + - NCBI + - sequencing + - fastq + - prefetch + - fasterq-dump +components: + - custom/sratoolsncbisettings + - sratools/prefetch + - sratools/fasterqdump +input: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - id: + type: string + description: > + SRA run identifier. 
+ + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + + pattern: "*.cart" +# TODO Update when we decide on a standard for subworkflow docs +output: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - reads: + type: file + description: Extracted FASTQ file or files if the sequencing reads are paired-end. + pattern: "*.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" + - "@drpatelh" +maintainers: + - "@Midnighter" + - "@drpatelh" diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index 0aecf87..0000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,44 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { - // create meta map - def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 
FastQ file does not exist!\n${row.fastq_2}" - } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return fastq_meta -} diff --git a/tests/data/ERR1109373.sra b/tests/data/ERR1109373.sra new file mode 100644 index 0000000..bca7207 Binary files /dev/null and b/tests/data/ERR1109373.sra differ diff --git a/tests/data/SAMPLE1.json.gz b/tests/data/SAMPLE1.json.gz deleted file mode 100644 index 2311b81..0000000 Binary files a/tests/data/SAMPLE1.json.gz and /dev/null differ diff --git a/tests/data/SAMPLE1.simple.json.gz b/tests/data/SAMPLE1.simple.json.gz deleted file mode 100644 index f61c6c6..0000000 Binary files a/tests/data/SAMPLE1.simple.json.gz and /dev/null differ diff --git a/tests/data/SAMPLE2.simple.json.gz b/tests/data/SAMPLE2.simple.json.gz deleted file mode 100644 index 9b2057e..0000000 Binary files a/tests/data/SAMPLE2.simple.json.gz and /dev/null differ diff --git a/tests/data/SAMPLE3.simple.json.gz b/tests/data/SAMPLE3.simple.json.gz deleted file mode 100644 index f49eb9e..0000000 Binary files a/tests/data/SAMPLE3.simple.json.gz and /dev/null differ diff --git a/tests/data/samplesheet.csv b/tests/data/samplesheet.csv new file mode 100644 index 0000000..fdf9dae --- /dev/null +++ b/tests/data/samplesheet.csv @@ -0,0 +1,3 @@ +sample,insdc_accession +SAMPLE1,ERR1109373 +SAMPLE2,SRR13191702 diff --git a/tests/data/test1_iridanext.output.json b/tests/data/test1_iridanext.output.json new file mode 100644 index 0000000..cfa4828 --- /dev/null +++ b/tests/data/test1_iridanext.output.json @@ -0,0 +1,33 @@ +{ + "files": { + "global": [ + + ], + "samples": { + "SAMPLE2": [ + { + "path": "reads/SRR13191702_2.fastq.gz" + }, + { + "path": "reads/SRR13191702_1.fastq.gz" + } + ], + "SAMPLE1": [ + { + "path": "reads/ERR1109373_2.fastq.gz" + }, + { + "path": "reads/ERR1109373_1.fastq.gz" + }, + { + "path": "reads/ERR1109373.fastq.gz" + } + ] + } + }, + "metadata": { + "samples": { + + } + } +} \ No newline at end of file diff --git 
a/tests/modules/fasterqdump/main.nf.test b/tests/modules/fasterqdump/main.nf.test new file mode 100644 index 0000000..104c31f --- /dev/null +++ b/tests/modules/fasterqdump/main.nf.test @@ -0,0 +1,30 @@ +nextflow_process { + name "Test Process SRATOOLS_FASTERQDUMP" + script "modules/local/sratools/fasterqdump/main.nf" + process "SRATOOLS_FASTERQDUMP" + tag "modules" + tag "modules_nfcore" + tag "sratools/fasterqdump" + + test("Test fasterqdump ERR1109373") { + + when { + process { + """ + input[0] = Channel.of([ [id: "SAMPLE1"], file("$baseDir/tests/data/ERR1109373.sra", checkIfExists: true) ]) + input[1] = [] + input[2] = [] + """ + } + + params { + outdir = "fasterqdump_test1_out" + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/tests/modules/fasterqdump/main.nf.test.snap b/tests/modules/fasterqdump/main.nf.test.snap new file mode 100644 index 0000000..6c6e406 --- /dev/null +++ b/tests/modules/fasterqdump/main.nf.test.snap @@ -0,0 +1,39 @@ +{ + "Test fasterqdump ERR1109373": { + "content": [ + { + "0": [ + [ + { + "id": "SAMPLE1" + }, + [ + "ERR1109373.fastq.gz:md5,b9acccb3c5d317a99f604375a09991aa", + "ERR1109373_1.fastq.gz:md5,38e94cf16bf3d3b7d3a05a4c21b8fcc3", + "ERR1109373_2.fastq.gz:md5,031a82c07f35c45ada5b6133b1181e68" + ] + ] + ], + "1": [ + "versions.yml:md5,a3d61a9761e1606ef8459f0b68821d7a" + ], + "reads": [ + [ + { + "id": "SAMPLE1" + }, + [ + "ERR1109373.fastq.gz:md5,b9acccb3c5d317a99f604375a09991aa", + "ERR1109373_1.fastq.gz:md5,38e94cf16bf3d3b7d3a05a4c21b8fcc3", + "ERR1109373_2.fastq.gz:md5,031a82c07f35c45ada5b6133b1181e68" + ] + ] + ], + "versions": [ + "versions.yml:md5,a3d61a9761e1606ef8459f0b68821d7a" + ] + } + ], + "timestamp": "2024-01-25T23:20:38.2550152" + } +} \ No newline at end of file diff --git a/tests/modules/local/assemblystub/main.nf.test b/tests/modules/local/assemblystub/main.nf.test deleted file mode 100644 index 881bf56..0000000 --- 
a/tests/modules/local/assemblystub/main.nf.test +++ /dev/null @@ -1,38 +0,0 @@ -nextflow_process { - - name "Test Process ASSEMBLY_STUB" - script "modules/local/assemblystub/main.nf" - process "ASSEMBLY_STUB" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")]) - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert assembly.size() == 1 - - // parse assembly file - def assembly_header = path(assembly.get(0)[1]).linesGzip[0] - def assembly_body = path(assembly.get(0)[1]).linesGzip[1] - - assert assembly_header.equals(">SAMPLE1-stub-assembly") - assert assembly_body.equals("ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTTAAAAACCCCCGGGGGTTTTT") - } - } - - } - -} diff --git a/tests/modules/local/generatesamplejson/main.nf.test b/tests/modules/local/generatesamplejson/main.nf.test deleted file mode 100644 index ba25484..0000000 --- a/tests/modules/local/generatesamplejson/main.nf.test +++ /dev/null @@ -1,40 +0,0 @@ -nextflow_process { - - name "Test Process GENERATE_SAMPLE_JSON" - script "modules/local/generatesamplejson/main.nf" - process "GENERATE_SAMPLE_JSON" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")], file("SAMPLE1.assembly.fa.gz")) - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert json.size() == 1 - - // parse output json file - def sample_json_string = path(json.get(0)[1]).linesGzip.join("\n") - def parser = new groovy.json.JsonSlurper() - def sample_json = parser.parseText(sample_json_string) - - assert sample_json.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz") - 
assert sample_json.metadata.samples.SAMPLE1.reads[0].equals("sample1_R1.fastq.gz") - assert sample_json.metadata.samples.SAMPLE1.reads[1].equals("sample1_R2.fastq.gz") - } - } - - } - -} diff --git a/tests/modules/local/generatesummary/main.nf.test b/tests/modules/local/generatesummary/main.nf.test deleted file mode 100644 index b2eb189..0000000 --- a/tests/modules/local/generatesummary/main.nf.test +++ /dev/null @@ -1,37 +0,0 @@ -nextflow_process { - - name "Test Process GENERATE_SUMMARY" - script "modules/local/generatesummary/main.nf" - process "GENERATE_SUMMARY" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = [new Tuple(["id": "SAMPLE1"], [file("sample1_R1.fastq.gz"), file("sample1_R2.fastq.gz")], file("SAMPLE1.assembly.fa.gz"))] - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert summary.size() == 1 - - assert path(summary.get(0)).linesGzip[0].equals("IRIDANEXTEXAMPLE Pipeline Summary") - assert path(summary.get(0)).linesGzip[4].equals("SAMPLE1:") - assert path(summary.get(0)).linesGzip[5].contains("reads.1: ") - assert path(summary.get(0)).linesGzip[6].contains("reads.2: ") - assert path(summary.get(0)).linesGzip[7].contains("assembly: ") - } - } - - } - -} diff --git a/tests/modules/local/iridanextoutput/main.nf.test b/tests/modules/local/iridanextoutput/main.nf.test deleted file mode 100644 index 72808ab..0000000 --- a/tests/modules/local/iridanextoutput/main.nf.test +++ /dev/null @@ -1,51 +0,0 @@ -nextflow_process { - - name "Test Process IRIDA_NEXT_OUTPUT" - script "modules/local/iridanextoutput/main.nf" - process "IRIDA_NEXT_OUTPUT" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = [file("$baseDir/tests/data/SAMPLE1.simple.json.gz"), file("$baseDir/tests/data/SAMPLE2.simple.json.gz"), 
file("$baseDir/tests/data/SAMPLE3.simple.json.gz")] - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - assert output_json.size() == 1 - - // parse output json file - def json_string = path(output_json.get(0)).linesGzip.join("\n") - def parser = new groovy.json.JsonSlurper() - def irida_json = parser.parseText(json_string) - - assert irida_json.files.global[0].path.equals("summary/summary.txt.gz") - - assert irida_json.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz") - assert irida_json.files.samples.SAMPLE2[0].path.equals("assembly/SAMPLE2.assembly.fa.gz") - assert irida_json.files.samples.SAMPLE3[0].path.equals("assembly/SAMPLE3.assembly.fa.gz") - - assert irida_json.metadata.samples.SAMPLE1.'reads.1'.equals("sample1_R1.fastq.gz") - assert irida_json.metadata.samples.SAMPLE1.'reads.2'.equals("sample1_R2.fastq.gz") - - assert irida_json.metadata.samples.SAMPLE2.'reads.1'.equals("sample2_R1.fastq.gz") - assert irida_json.metadata.samples.SAMPLE2.'reads.2'.equals("sample2_R2.fastq.gz") - - assert irida_json.metadata.samples.SAMPLE3.'reads.1'.equals("sample1_R1.fastq.gz") - assert irida_json.metadata.samples.SAMPLE3.'reads.2'.equals("null") - } - } - - } - -} diff --git a/tests/modules/local/simplifyiridajson/main.nf.test b/tests/modules/local/simplifyiridajson/main.nf.test deleted file mode 100644 index 7d61567..0000000 --- a/tests/modules/local/simplifyiridajson/main.nf.test +++ /dev/null @@ -1,41 +0,0 @@ -nextflow_process { - - name "Test Process SIMPLIFY_IRIDA_JSON" - script "modules/local/simplifyiridajson/main.nf" - process "SIMPLIFY_IRIDA_JSON" - - test("Basic execution, check output.") { - - when { - params { - outdir = "tests/results" - } - process { - """ - input[0] = new Tuple(["id": "SAMPLE1"], file("$baseDir/tests/data/SAMPLE1.json.gz")) - """ - } - } - - then { - assert process.success - - with(process.out) { - // check if emitted output has been created - 
assert simple_json.size() == 1 - - // parse output json file - def json_string = path(simple_json.get(0)[1]).linesGzip.join("\n") - def parser = new groovy.json.JsonSlurper() - def json_simple = parser.parseText(json_string) - - assert json_simple.files.samples.SAMPLE1[0].path.equals("assembly/SAMPLE1.assembly.fa.gz") - - assert json_simple.metadata.samples.SAMPLE1.'reads.1'.equals("sample1_R1.fastq.gz") - assert json_simple.metadata.samples.SAMPLE1.'reads.2'.equals("sample1_R2.fastq.gz") - } - } - - } - -} diff --git a/tests/nextflow.config b/tests/nextflow.config index c19b1ad..91f0548 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -3,3 +3,6 @@ Nextflow config file for running tests ======================================================================================== */ + +/* Remove gzipping on JSON output for testing/asserts on file contents */ +iridanext.output.path = "${params.outdir}/iridanext.output.json" diff --git a/tests/pipelines/fetchdatairidanext.nf.test b/tests/pipelines/fetchdatairidanext.nf.test new file mode 100644 index 0000000..698dba9 --- /dev/null +++ b/tests/pipelines/fetchdatairidanext.nf.test @@ -0,0 +1,28 @@ +nextflow_pipeline { + + name "Test fetching small datasets from NCBI" + script "main.nf" + + test("basic integration test") { + + when { + params { + input = "$baseDir/tests/data/samplesheet.csv" + outdir = "test1_out" + } + } + + then { + assert workflow.success + + // IRIDA Next output file + assert path("$launchDir/test1_out/iridanext.output.json").json == path("$baseDir/tests/data/test1_iridanext.output.json").json + + // Output data + assert path("$launchDir/test1_out/reads/ERR1109373_1.fastq.gz").linesGzip.size() == 512 + assert path("$launchDir/test1_out/reads/ERR1109373_2.fastq.gz").linesGzip.size() == 512 + assert path("$launchDir/test1_out/reads/SRR13191702_1.fastq.gz").linesGzip.size() == 364 + assert path("$launchDir/test1_out/reads/SRR13191702_2.fastq.gz").linesGzip.size() == 364 + } + } +} diff --git 
a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test new file mode 100644 index 0000000..cf160e1 --- /dev/null +++ b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test @@ -0,0 +1,37 @@ +nextflow_workflow { + + name "Test workflow: fastq_download_prefetch_fasterqdump_sratools/main.nf" + script "subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/main.nf" + workflow "FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS" + tag "subworkflow" + tag "subworkflow_nfcore" + tag "custom/sratoolsncbisettings" + tag "sratools/prefetch" + tag "sratools/fasterqdump" + tag "fastq_download_prefetch_fasterqdump_sratools" + + test("Parameters: default") { + + when { + workflow { + """ + input[0] = Channel.of( + [[ id:'test_single_end', single_end:true ], 'DRR000774'], + [[ id:'test_paired_end', single_end:false ], 'SRR11140744'] + ) + input[1] = [] + """ + } + params { + outdir = "output" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + } +} diff --git a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap new file mode 100644 index 0000000..8c168d8 --- /dev/null +++ b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/main.nf.test.snap @@ -0,0 +1,59 @@ +{ + "Parameters: default": { + "content": [ + { + "0": [ + [ + { + "id": "test_paired_end", + "single_end": false + }, + [ + "SRR11140744.fastq.gz:md5,96a821b351f3a450889a38e7099f7b95", + "SRR11140744_1.fastq.gz:md5,8573015c91d099b6e30789f8bab2f43c", + "SRR11140744_2.fastq.gz:md5,37e6f719a022dc3c9994c80fbc20c311" + ] + ], + [ + { + "id": "test_single_end", + "single_end": true + }, + "DRR000774.fastq.gz:md5,a110f93f7a9b0271455f5a435bce73c7" + ] + ], + "1": [ + "versions.yml:md5,1a2218ff913fc33408bffccb081b5048", + 
"versions.yml:md5,98d78bba9f3da39a0b7db6e9c7dcc224", + "versions.yml:md5,9c558ff624585a6eee82a19c8c0136db" + ], + "reads": [ + [ + { + "id": "test_paired_end", + "single_end": false + }, + [ + "SRR11140744.fastq.gz:md5,96a821b351f3a450889a38e7099f7b95", + "SRR11140744_1.fastq.gz:md5,8573015c91d099b6e30789f8bab2f43c", + "SRR11140744_2.fastq.gz:md5,37e6f719a022dc3c9994c80fbc20c311" + ] + ], + [ + { + "id": "test_single_end", + "single_end": true + }, + "DRR000774.fastq.gz:md5,a110f93f7a9b0271455f5a435bce73c7" + ] + ], + "versions": [ + "versions.yml:md5,1a2218ff913fc33408bffccb081b5048", + "versions.yml:md5,98d78bba9f3da39a0b7db6e9c7dcc224", + "versions.yml:md5,9c558ff624585a6eee82a19c8c0136db" + ] + } + ], + "timestamp": "2024-01-25T22:06:20.7303705" + } +} \ No newline at end of file diff --git a/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/tags.yml b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/tags.yml new file mode 100644 index 0000000..ab06450 --- /dev/null +++ b/tests/workflows/fastq_download_prefetch_fasterqdump_sratools/tags.yml @@ -0,0 +1,2 @@ +fastq_download_prefetch_fasterqdump_sratools: + - subworkflows/local/fastq_download_prefetch_fasterqdump_sratools/** diff --git a/workflows/iridanextexample.nf b/workflows/fetchdatairidanext.nf similarity index 62% rename from workflows/iridanextexample.nf rename to workflows/fetchdatairidanext.nf index dc30303..74b5fcc 100644 --- a/workflows/iridanextexample.nf +++ b/workflows/fetchdatairidanext.nf @@ -13,7 +13,7 @@ def summary_params = paramsSummaryMap(workflow) // Print parameter summary log to screen log.info logo + paramsSummaryLog(workflow) + citation -WorkflowIridanextexample.initialise(params, log) +WorkflowFetchdatairidanext.initialise(params, log) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -30,12 +30,7 @@ WorkflowIridanextexample.initialise(params, log) // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // 
-include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { GENERATE_SAMPLE_JSON } from '../modules/local/generatesamplejson/main' -include { SIMPLIFY_IRIDA_JSON } from '../modules/local/simplifyiridajson/main' -include { IRIDA_NEXT_OUTPUT } from '../modules/local/iridanextoutput/main' -include { ASSEMBLY_STUB } from '../modules/local/assemblystub/main' -include { GENERATE_SUMMARY } from '../modules/local/generatesummary/main' +include { FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS } from '../subworkflows/local/fastq_download_prefetch_fasterqdump_sratools' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -54,47 +49,20 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -workflow IRIDANEXT { +workflow FETCHDATAIRIDANEXT { ch_versions = Channel.empty() // Create a new channel of metadata from a sample sheet // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") - // Map the inputs so that they conform to the nf-core-expected "reads" format. - // Either [meta, [fastq_1]] or [meta, [fastq_1, fastq_2]] if fastq_2 exists - .map { meta, fastq_1, fastq_2 -> - fastq_2 ? 
tuple(meta, [ file(fastq_1), file(fastq_2) ]) : - tuple(meta, [ file(fastq_1) ])} - - ASSEMBLY_STUB ( - input - ) - ch_versions = ch_versions.mix(ASSEMBLY_STUB.out.versions) - - // A channel of tuples of ({meta}, [read[0], read[1]], assembly) - ch_tuple_read_assembly = input.join(ASSEMBLY_STUB.out.assembly) - - GENERATE_SAMPLE_JSON ( - ch_tuple_read_assembly - ) - ch_versions = ch_versions.mix(GENERATE_SAMPLE_JSON.out.versions) - - GENERATE_SUMMARY ( - ch_tuple_read_assembly.collect{ [it] } - ) - ch_versions = ch_versions.mix(GENERATE_SUMMARY.out.versions) - - SIMPLIFY_IRIDA_JSON ( - GENERATE_SAMPLE_JSON.out.json - ) - ch_versions = ch_versions.mix(SIMPLIFY_IRIDA_JSON.out.versions) - ch_simplified_jsons = SIMPLIFY_IRIDA_JSON.out.simple_json.map { meta, data -> data }.collect() // Collect JSONs + meta_accessions = input.map {meta -> tuple(["id": meta.id.first()], meta.insdc_accession.first())} - IRIDA_NEXT_OUTPUT ( - samples_data=ch_simplified_jsons + FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS ( + ch_sra_ids = meta_accessions, + ch_dbgap_key = [] ) - ch_versions = ch_versions.mix(IRIDA_NEXT_OUTPUT.out.versions) + ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml')