From c4934d9869ff7dd8c3cf0505cbe013a397759a96 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Thu, 23 Sep 2021 15:35:14 +1200 Subject: [PATCH] Move (nextstrain) alignment into separate profiles This breaks each of our main nextstrain profiles into two profiles each - one to perform alignment (etc) and a second to run the phylogenetics. This is part of a wider effort to generate alignments as soon as new sequences are available. Using GISAID as an example (also implemented for open, read the following but replace `gisaid` with `open`): The first profile `nextstrain_profiles/nextstrain-gisaid-preprocess` takes sequences and metadata and produces three files which we upload to S3, `results/filtered_gisaid.fasta.xz`, `results/masked_gisaid.fasta.xz` and `results/aligned_gisaid.fasta.xz`. While this is a separate profile, the `builds.yaml` file is around a dozen lines of code. I chose this approach rather than using another `--configfile` in the existing profile as the way Snakemake overlays configs (or doesn't) isn't intuitive, and stubby profiles are easier to reason about. The second profile, `nextstrain_profiles/nextstrain-gisaid`, is relatively unchanged, except that we now start from `results/filtered_gisaid.fasta.xz` and thus should be much faster to run. Note that the `upload` rule here will no longer upload the files which are now within the previous profile. It is still possible to start this workflow from (unaligned) sequences, however there should be no reason to do so. The sets of uploaded files are defined by config["upload"]. This allows for a profile to be created which uploads both preprocessing files and build files, if desired. An introduction to the different profiles, and exact commands to run each profile have been added to docs/dev_docs.md. Note that both profiles will run `rule sanitize_metadata`, as each depends on its output which is not uploaded as an intermediate file. 
None of these changes should affect non-nextstrain-core builds / profiles. --- docs/dev_docs.md | 61 ++++++++++++++++++- .../nextstrain-gisaid-preprocess/builds.yaml | 22 +++++++ .../nextstrain-gisaid-preprocess/config.yaml | 10 +++ .../nextstrain-gisaid/builds.yaml | 15 ++--- .../nextstrain-open-preprocess/builds.yaml | 22 +++++++ .../nextstrain-open-preprocess/config.yaml | 10 +++ .../nextstrain-open/builds.yaml | 15 ++--- workflow/schemas/config.schema.yaml | 9 +++ workflow/snakemake_rules/common.smk | 16 ++++- .../snakemake_rules/export_for_nextstrain.smk | 5 +- 10 files changed, 160 insertions(+), 25 deletions(-) create mode 100644 nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml create mode 100644 nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml create mode 100644 nextstrain_profiles/nextstrain-open-preprocess/builds.yaml create mode 100644 nextstrain_profiles/nextstrain-open-preprocess/config.yaml diff --git a/docs/dev_docs.md b/docs/dev_docs.md index 4f45535a3..0c61bef9e 100644 --- a/docs/dev_docs.md +++ b/docs/dev_docs.md @@ -68,13 +68,70 @@ Prior to merging a pull request that introduces a new backward incompatible chan We do not release new minor versions for new features, but you should document new features in the change log as part of the corresponding pull request under a heading for the date those features are merged. +## Running Core Nextstrain Builds + +The "core" nextstrain builds consist of a global analysis and six regional analyses, performed independently for GISAID data and open data (currently open data is GenBank data). +Stepping back, the process can be broken into three steps: +1. Ingest and curation of raw data. This is performed by the [ncov-ingest](https://github.com/nextstrain/ncov-ingest/) repo and resulting files are uploaded to S3 buckets. +2. Preprocessing of data (alignment, masking and QC filtering). 
This is performed by the profiles `nextstrain_profiles/nextstrain-open-preprocess` and `nextstrain_profiles/nextstrain-gisaid-preprocess`. The resulting files are uploaded to S3 buckets by the `upload` rule. +3. Phylogenetic builds, which start from the files produced by the previous step. This is performed by the profiles `nextstrain_profiles/nextstrain-open` and `nextstrain_profiles/nextstrain-gisaid`. The resulting files are uploaded to S3 buckets by the `upload` rule. + + +### Manually running preprocessing + +To run these pipelines without uploading the results: +```sh +snakemake -pf results/filtered_open.fasta.xz --profile nextstrain_profiles/nextstrain-open-preprocess +snakemake -pf results/filtered_gisaid.fasta.xz --profile nextstrain_profiles/nextstrain-gisaid-preprocess +``` + +If you wish to upload the resulting information, you should run the `upload` rule. +Optionally, you may wish to define a specific `S3_DST_BUCKET` to avoid overwriting the files already present on the S3 buckets: +```sh +snakemake -pf upload --profile nextstrain_profiles/nextstrain-open-preprocess \ + --config S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/TRIAL_NAME +snakemake -pf upload --profile nextstrain_profiles/nextstrain-gisaid-preprocess \ + --config S3_DST_BUCKET=nextstrain-ncov-private/trial/TRIAL_NAME +``` + +### Manually running phylogenetic builds + +To run these pipelines locally, without uploading the results: +```sh +snakemake -pf all --profile nextstrain_profiles/nextstrain-open +snakemake -pf all --profile nextstrain_profiles/nextstrain-gisaid +``` +You can replace `all` with, for instance, `auspice/ncov_open_global.json` to avoid building all regions. +The resulting dataset(s) can be visualised in the browser by running `auspice view --datasetDir auspice`. + +If you wish to upload the resulting information, you should run the `upload` and/or `deploy` rules. 
+The `upload` rule uploads the resulting files, including intermediate files, to specific S3 buckets; this rule uses the `S3_DST_BUCKET` config parameter. +The `deploy` rule uploads the dataset files such that they are accessible via nextstrain URLs (e.g. nextstrain.org/ncov/gisaid/global); this rule uses the `deploy_url` and `auspice_json_prefix` parameters. +You may wish to override these parameters for your local runs to avoid overwriting data which is already present. +For instance, here are the commands used by the trial builds action (see below): +```sh +snakemake -pf upload deploy \ + --profile nextstrain_profiles/nextstrain-open \ + --config \ + S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/TRIAL_NAME \ + deploy_url=s3://nextstrain-staging/ \ + auspice_json_prefix=ncov_open_trial_TRIAL_NAME +snakemake -pf upload deploy \ + --profile nextstrain_profiles/nextstrain-gisaid \ + --config \ + S3_DST_BUCKET=nextstrain-ncov-private/trial/TRIAL_NAME \ + deploy_url=s3://nextstrain-staging/ \ + auspice_json_prefix=ncov_gisaid_trial_TRIAL_NAME +``` + + ## Triggering routine builds Typically, everything’s triggered from the `ncov-ingest` pipeline’s `trigger` command. -After updating the intermediate files, that command will run this `ncov` pipeline force-requiring the rules `deploy` and `upload`. +After updating the intermediate files, that command will run the phylogenetic `ncov` pipelines (step 3, above) force-requiring the rules `deploy` and `upload`. ## Triggering trial builds -This repository contains a GitHub Action `trial-build` which is manually run [via github.com](https://github.com/nextstrain/ncov/actions/workflows/trial-build.yml). +This repository contains a GitHub Action `trial-build` which is manually run [via github.com](https://github.com/nextstrain/ncov/actions/workflows/trial-build.yml) and runs both of the phylogenetic build pipelines (gisaid + open). 
This will ask for a “trial name” and upload intermediate files to `nextstrain-ncov-private/trial/$TRIAL_NAME` and `nextstrain-staging/files/ncov/open/trial/$TRIAL_NAME`. Auspice datasets for visualisation will be available at `https://nextstrain.org/staging/ncov/gisaid/trial/$TRIAL_NAME/$BUILD_NAME` and `https://nextstrain.org/staging/ncov/open/trial/$TRIAL_NAME/$BUILD_NAME`. diff --git a/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml new file mode 100644 index 000000000..00f6715c1 --- /dev/null +++ b/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml @@ -0,0 +1,22 @@ +custom_rules: + - workflow/snakemake_rules/export_for_nextstrain.smk + +# These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. +# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config. +# P.S. These are intentionally set as top-level keys as this allows command-line overrides. 
+S3_DST_BUCKET: "nextstrain-ncov-private" +S3_DST_COMPRESSION: "xz" +S3_DST_ORIGINS: ["gisaid"] + +upload: + - preprocessing-files + +inputs: + - name: gisaid + metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" + sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz" + +# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds +deploy_url: s3://nextstrain-data +slack_token: ~ +slack_channel: "#ncov-gisaid-updates" \ No newline at end of file diff --git a/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml b/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml new file mode 100644 index 000000000..cec6981d6 --- /dev/null +++ b/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml @@ -0,0 +1,10 @@ +configfile: + - defaults/parameters.yaml + - nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml + +keep-going: True +printshellcmds: True +show-failed-logs: True +restart-times: 1 +reason: True +stats: stats.json diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index 190856a75..92f44df26 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -11,22 +11,19 @@ custom_rules: S3_DST_BUCKET: "nextstrain-ncov-private" S3_DST_COMPRESSION: "xz" S3_DST_ORIGINS: ["gisaid"] +upload: + - build-files genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] use_nextalign: true -# NOTE for shepherds -- there are commented out inputs here, you can -# uncomment them to start the pipeline at that stage. -# E.g. if you uncomment `filtered` then the pipeline -# will start by downloading that file and proceeding straight to -# subsampling +# Note: we have a separate profile for aligning GISAID sequences. This is triggered +# as soon as new sequences are available. 
This workflow is thus intended to be +# started from the filtered alignment. james, sept 2021 inputs: - name: gisaid metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" - sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz" - # aligned: "s3://nextstrain-ncov-private/aligned.fasta.xz" - # masked: "s3://nextstrain-ncov-private/masked.fasta.xz" - # filtered: "s3://nextstrain-ncov-private/filtered.fasta.xz" + filtered: "s3://nextstrain-ncov-private/filtered.fasta.xz" # Define locations for which builds should be created. # For each build we specify a subsampling scheme via an explicit key. diff --git a/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml b/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml new file mode 100644 index 000000000..88381f2e2 --- /dev/null +++ b/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml @@ -0,0 +1,22 @@ +custom_rules: + - workflow/snakemake_rules/export_for_nextstrain.smk + +# These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. +# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config. +# P.S. These are intentionally set as top-level keys as this allows command-line overrides. 
+S3_DST_BUCKET: "nextstrain-data/files/ncov/open" +S3_DST_COMPRESSION: "xz" +S3_DST_ORIGINS: ["open"] + +upload: + - preprocessing-files + +inputs: + - name: open + metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.gz" + sequences: "s3://nextstrain-data/files/ncov/open/sequences.fasta.xz" + +# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds +deploy_url: s3://nextstrain-data +slack_token: ~ +slack_channel: "#ncov-genbank-updates" \ No newline at end of file diff --git a/nextstrain_profiles/nextstrain-open-preprocess/config.yaml b/nextstrain_profiles/nextstrain-open-preprocess/config.yaml new file mode 100644 index 000000000..22d00a1e6 --- /dev/null +++ b/nextstrain_profiles/nextstrain-open-preprocess/config.yaml @@ -0,0 +1,10 @@ +configfile: + - defaults/parameters.yaml + - nextstrain_profiles/nextstrain-open-preprocess/builds.yaml + +keep-going: True +printshellcmds: True +show-failed-logs: True +restart-times: 1 +reason: True +stats: stats.json diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index 0e1bacf18..a1e03a50f 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -11,19 +11,16 @@ custom_rules: S3_DST_BUCKET: "nextstrain-data/files/ncov/open" S3_DST_COMPRESSION: "xz" S3_DST_ORIGINS: ["open"] +upload: + - build-files -# NOTE for shepherds -- there are commented out inputs here, you can -# uncomment them to start the pipeline at that stage. -# E.g. if you uncomment `filtered` then the pipeline -# will start by downloading that file and proceeding straight to -# subsampling +# Note: we have a separate profile for aligning open sequences. This is triggered +# as soon as new sequences are available. This workflow is thus intended to be +# started from the filtered alignment. 
james, sept 2021 inputs: - name: open metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.gz" - sequences: "s3://nextstrain-data/files/ncov/open/sequences.fasta.xz" - # aligned: "s3://nextstrain-data/files/ncov/open/aligned.fasta.xz" - # masked: "s3://nextstrain-data/files/ncov/open/masked.fasta.xz" - # filtered: "s3://nextstrain-data/files/ncov/open/filtered.fasta.xz" + filtered: "s3://nextstrain-data/files/ncov/open/filtered.fasta.xz" # Define locations for which builds should be created. # For each build we specify a subsampling scheme via an explicit key. diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 6db838cc6..4873c39e9 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -61,3 +61,12 @@ properties: type: string # A similar pattern is used in the workflow's wildcard constraints. pattern: "^[a-zA-Z0-9-]+$" + + upload: + type: array + minItems: 1 + items: + type: string + enum: + - preprocessing-files + - build-files \ No newline at end of file diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index 5130a7158..89c86b1b2 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -152,15 +152,16 @@ def _get_upload_inputs(wildcards): origin = config["S3_DST_ORIGINS"][0] # mapping of remote → local filenames - uploads = { + preprocessing_files = { f"aligned.fasta.xz": f"results/aligned_{origin}.fasta.xz", # from `rule align` f"masked.fasta.xz": f"results/masked_{origin}.fasta.xz", # from `rule mask` f"filtered.fasta.xz": f"results/filtered_{origin}.fasta.xz", # from `rule filter` f"mutation-summary.tsv.xz": f"results/mutation_summary_{origin}.tsv.xz", # from `rule mutation_summary` } + build_files = {} for build_name in config["builds"]: - uploads.update({ + build_files.update({ f"{build_name}/sequences.fasta.xz": f"results/{build_name}/{build_name}_subsampled_sequences.fasta.xz", # from `rule 
combine_samples` f"{build_name}/metadata.tsv.xz": f"results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", # from `rule combine_samples` f"{build_name}/aligned.fasta.xz": f"results/{build_name}/aligned.fasta.xz", # from `rule build_align` @@ -170,4 +171,13 @@ def _get_upload_inputs(wildcards): f"{build_name}/{build_name}_root-sequence.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_root-sequence.json" }) - return uploads + req_upload = config.get("upload", []) + if "preprocessing-files" in req_upload and "build-files" in req_upload: + return {**preprocessing_files, **build_files} + elif "preprocessing-files" in req_upload: + return preprocessing_files + elif "build-files" in req_upload: + return build_files + else: + raise Exception("The upload rule requires an 'upload' parameter in the config.") + diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk index dad5f5ca0..7bba71312 100644 --- a/workflow/snakemake_rules/export_for_nextstrain.smk +++ b/workflow/snakemake_rules/export_for_nextstrain.smk @@ -146,6 +146,7 @@ from os import environ SLACK_TOKEN = environ["SLACK_TOKEN"] = config["slack_token"] or "" SLACK_CHANNEL = environ["SLACK_CHANNEL"] = config["slack_channel"] or "" +BUILD_DESCRIPTION = f"Build to upload {' & '.join(config.get('upload', ['nothing']))}" try: deploy_origin = ( @@ -197,7 +198,7 @@ rule upload: shell("./scripts/upload-to-s3 {local:q} s3://{params.s3_bucket:q}/{remote:q} | tee -a {log:q}") onstart: - slack_message = f"Build {deploy_origin} started." + slack_message = f"{BUILD_DESCRIPTION} {deploy_origin} started." if SLACK_TOKEN and SLACK_CHANNEL: shell(f""" @@ -210,7 +211,7 @@ onstart: """) onerror: - slack_message = f"Build {deploy_origin} failed." + slack_message = f"{BUILD_DESCRIPTION} {deploy_origin} failed." if SLACK_TOKEN and SLACK_CHANNEL: shell(f"""