From c4934d9869ff7dd8c3cf0505cbe013a397759a96 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Thu, 23 Sep 2021 15:35:14 +1200
Subject: [PATCH] Move (nextstrain) alignment into separate profiles

This splits each of our main nextstrain profiles into two profiles:
one to perform alignment (etc) and a second to run the phylogenetics.
This is part of a wider effort to generate alignments as soon as new
sequences are available.

Using GISAID as an example (also implemented for open; read the
following but replace `gisaid` with `open`):

The first profile `nextstrain_profiles/nextstrain-gisaid-preprocess`
takes sequences and metadata and produces three files which we upload
to S3, `results/filtered_gisaid.fasta.xz`,
`results/masked_gisaid.fasta.xz` and `results/aligned_gisaid.fasta.xz`.
While this is a separate profile, the `builds.yaml` file is around a
dozen lines of code. I chose this approach rather than using another
`--configfile` in the existing profile as the way Snakemake overlays
configs (or doesn't) isn't intuitive, and stubby profiles are easier to
reason about.

The second profile, `nextstrain_profiles/nextstrain-gisaid` is
relatively unchanged, except that we now start from
`results/filtered_gisaid.fasta.xz` and thus should be much faster to
run. Note that the `upload` rule here will no longer upload the files
which are now uploaded by the preprocessing profile. It is still possible
to start this workflow from (unaligned) sequences, however there should
be no reason to do so.

The sets of uploaded files are defined by config["upload"]. This allows
for a profile to be created which uploads both preprocessing files
and build files, if desired.

An introduction to the different profiles, and exact commands to run
each profile have been added to docs/dev_docs.md.

Note that both profiles will run `rule sanitize_metadata`, as each
depends on its output, which is not uploaded as an intermediate file.

None of these changes should affect non-nextstrain-core builds /
profiles.
---
 docs/dev_docs.md                              | 61 ++++++++++++++++++-
 .../nextstrain-gisaid-preprocess/builds.yaml  | 22 +++++++
 .../nextstrain-gisaid-preprocess/config.yaml  | 10 +++
 .../nextstrain-gisaid/builds.yaml             | 15 ++---
 .../nextstrain-open-preprocess/builds.yaml    | 22 +++++++
 .../nextstrain-open-preprocess/config.yaml    | 10 +++
 .../nextstrain-open/builds.yaml               | 15 ++---
 workflow/schemas/config.schema.yaml           |  9 +++
 workflow/snakemake_rules/common.smk           | 16 ++++-
 .../snakemake_rules/export_for_nextstrain.smk |  5 +-
 10 files changed, 160 insertions(+), 25 deletions(-)
 create mode 100644 nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml
 create mode 100644 nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml
 create mode 100644 nextstrain_profiles/nextstrain-open-preprocess/builds.yaml
 create mode 100644 nextstrain_profiles/nextstrain-open-preprocess/config.yaml

diff --git a/docs/dev_docs.md b/docs/dev_docs.md
index 4f45535a3..0c61bef9e 100644
--- a/docs/dev_docs.md
+++ b/docs/dev_docs.md
@@ -68,13 +68,70 @@ Prior to merging a pull request that introduces a new backward incompatible chan
 We do not release new minor versions for new features, but you should document new features in the change log as part of the corresponding pull request under a heading for the date those features are merged.
 
 
+## Running Core Nextstrain Builds
+
+The "core" nextstrain builds consist of a global analysis and six regional analyses, performed independently for GISAID data and open data (currently open data is GenBank data).
+Stepping back, the process can be broken into three steps:
+1. Ingest and curation of raw data. This is performed by the [ncov-ingest](https://github.com/nextstrain/ncov-ingest/) repo and resulting files are uploaded to S3 buckets.
+2. Preprocessing of data (alignment, masking and QC filtering). This is performed by the profiles `nextstrain_profiles/nextstrain-open-preprocess` and `nextstrain_profiles/nextstrain-gisaid-preprocess`. The resulting files are uploaded to S3 buckets by the `upload` rule.
+3. Phylogenetic builds, which start from the files produced by the previous step. This is performed by the profiles `nextstrain_profiles/nextstrain-open` and `nextstrain_profiles/nextstrain-gisaid`. The resulting files are uploaded to S3 buckets by the `upload` rule.
+
+
+### Manually running preprocessing
+
+To run these pipelines without uploading the results:
+```sh
+snakemake -pf results/filtered_open.fasta.xz --profile nextstrain_profiles/nextstrain-open-preprocess
+snakemake -pf results/filtered_gisaid.fasta.xz --profile nextstrain_profiles/nextstrain-gisaid-preprocess
+```
+
+If you wish to upload the resulting information, you should run the `upload` rule.
+Optionally, you may wish to define a specific `S3_DST_BUCKET` to avoid overwriting the files already present on the S3 buckets:
+```sh
+snakemake -pf upload --profile nextstrain_profiles/nextstrain-open-preprocess \
+    --config S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/TRIAL_NAME
+snakemake -pf upload --profile nextstrain_profiles/nextstrain-gisaid-preprocess \
+    --config S3_DST_BUCKET=nextstrain-ncov-private/trial/TRIAL_NAME
+```
+
+### Manually running phylogenetic builds
+
+To run these pipelines locally, without uploading the results:
+```sh
+snakemake -pf all --profile nextstrain_profiles/nextstrain-open
+snakemake -pf all --profile nextstrain_profiles/nextstrain-gisaid
+```
+You can replace `all` with, for instance, `auspice/ncov_open_global.json` to avoid building all regions.
+The resulting dataset(s) can be visualised in the browser by running `auspice view --datasetDir auspice`.
+
+If you wish to upload the resulting information, you should run the `upload` and/or `deploy` rules.
+The `upload` rule uploads the resulting files, including intermediate files, to specific S3 buckets; this rule uses the `S3_DST_BUCKET` config parameter.
+The `deploy` rule uploads the dataset files such that they are accessible via nextstrain URLs (e.g. nextstrain.org/ncov/gisaid/global); this rule uses the `deploy_url` and `auspice_json_prefix` parameters.
+You may wish to overwrite these parameters for your local runs to avoid overwriting data which is already present.
+For instance, here are the commands used by the trial builds action (see below):
+```sh
+snakemake -pf upload deploy \
+    --profile nextstrain_profiles/nextstrain-open \
+    --config \
+        S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/TRIAL_NAME \
+        deploy_url=s3://nextstrain-staging/ \
+        auspice_json_prefix=ncov_open_trial_TRIAL_NAME
+snakemake -pf upload deploy \
+    --profile nextstrain_profiles/nextstrain-gisaid \
+    --config \
+        S3_DST_BUCKET=nextstrain-ncov-private/trial/TRIAL_NAME \
+        deploy_url=s3://nextstrain-staging/ \
+        auspice_json_prefix=ncov_gisaid_trial_TRIAL_NAME
+```
+
+
 ## Triggering routine builds
 
 Typically, everything’s triggered from the  `ncov-ingest` pipeline’s `trigger` command.
-After updating the intermediate files, that command will run this `ncov` pipeline force-requiring the rules `deploy` and `upload`.
+After updating the intermediate files, that command will run the phylogenetic `ncov` pipelines (step 3, above) force-requiring the rules `deploy` and `upload`.
 
 ## Triggering trial builds
 
-This repository contains a GitHub Action `trial-build` which is manually run [via github.com](https://github.com/nextstrain/ncov/actions/workflows/trial-build.yml).
+This repository contains a GitHub Action `trial-build` which is manually run [via github.com](https://github.com/nextstrain/ncov/actions/workflows/trial-build.yml) and runs both of the phylogenetic build pipelines (gisaid + open).
 This will ask for a “trial name” and upload intermediate files to  `nextstrain-ncov-private/trial/$TRIAL_NAME` and `nextstrain-staging/files/ncov/open/trial/$TRIAL_NAME`.
 Auspice datasets for visualisation will be available at `https://nextstrain.org/staging/ncov/gisaid/trial/$TRIAL_NAME/$BUILD_NAME` and `https://nextstrain.org/staging/ncov/open/trial/$TRIAL_NAME/$BUILD_NAME`.
diff --git a/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml
new file mode 100644
index 000000000..00f6715c1
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml
@@ -0,0 +1,22 @@
+custom_rules:
+  - workflow/snakemake_rules/export_for_nextstrain.smk
+
+# These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified.
+# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config.
+# P.S. These are intentionally set as top-level keys as this allows command-line overrides.
+S3_DST_BUCKET: "nextstrain-ncov-private"
+S3_DST_COMPRESSION: "xz"
+S3_DST_ORIGINS: ["gisaid"]
+
+upload:
+  - preprocessing-files
+
+inputs:
+  - name: gisaid
+    metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz"
+    sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz"
+
+# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds
+deploy_url: s3://nextstrain-data
+slack_token: ~
+slack_channel: "#ncov-gisaid-updates"
\ No newline at end of file
diff --git a/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml b/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml
new file mode 100644
index 000000000..cec6981d6
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml
@@ -0,0 +1,10 @@
+configfile:
+  - defaults/parameters.yaml
+  - nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml
+
+keep-going: True
+printshellcmds: True
+show-failed-logs: True
+restart-times: 1
+reason: True
+stats: stats.json
diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index 190856a75..92f44df26 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -11,22 +11,19 @@ custom_rules:
 S3_DST_BUCKET: "nextstrain-ncov-private"
 S3_DST_COMPRESSION: "xz"
 S3_DST_ORIGINS: ["gisaid"]
+upload:
+  - build-files
 
 genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"]
 use_nextalign: true
 
-# NOTE for shepherds -- there are commented out inputs here, you can
-# uncomment them to start the pipeline at that stage.
-# E.g. if you uncomment `filtered` then the pipeline
-# will start by downloading that file and proceeding straight to
-# subsampling
+# Note: we have a separate profile for aligning GISAID sequences. This is triggered
+# as soon as new sequences are available. This workflow is thus intended to be
+# started from the filtered alignment.                        james, sept 2021
 inputs:
   - name: gisaid
     metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz"
-    sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz"
-    # aligned: "s3://nextstrain-ncov-private/aligned.fasta.xz"
-    # masked: "s3://nextstrain-ncov-private/masked.fasta.xz"
-    # filtered: "s3://nextstrain-ncov-private/filtered.fasta.xz"
+    filtered: "s3://nextstrain-ncov-private/filtered.fasta.xz"
 
 # Define locations for which builds should be created.
 # For each build we specify a subsampling scheme via an explicit key.
diff --git a/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml b/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml
new file mode 100644
index 000000000..88381f2e2
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml
@@ -0,0 +1,22 @@
+custom_rules:
+  - workflow/snakemake_rules/export_for_nextstrain.smk
+
+# These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified.
+# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config.
+# P.S. These are intentionally set as top-level keys as this allows command-line overrides.
+S3_DST_BUCKET: "nextstrain-data/files/ncov/open"
+S3_DST_COMPRESSION: "xz"
+S3_DST_ORIGINS: ["open"]
+
+upload:
+  - preprocessing-files
+
+inputs:
+  - name: open
+    metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.gz"
+    sequences: "s3://nextstrain-data/files/ncov/open/sequences.fasta.xz"
+
+# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds
+deploy_url: s3://nextstrain-data
+slack_token: ~
+slack_channel: "#ncov-genbank-updates"
\ No newline at end of file
diff --git a/nextstrain_profiles/nextstrain-open-preprocess/config.yaml b/nextstrain_profiles/nextstrain-open-preprocess/config.yaml
new file mode 100644
index 000000000..22d00a1e6
--- /dev/null
+++ b/nextstrain_profiles/nextstrain-open-preprocess/config.yaml
@@ -0,0 +1,10 @@
+configfile:
+  - defaults/parameters.yaml
+  - nextstrain_profiles/nextstrain-open-preprocess/builds.yaml
+
+keep-going: True
+printshellcmds: True
+show-failed-logs: True
+restart-times: 1
+reason: True
+stats: stats.json
diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml
index 0e1bacf18..a1e03a50f 100644
--- a/nextstrain_profiles/nextstrain-open/builds.yaml
+++ b/nextstrain_profiles/nextstrain-open/builds.yaml
@@ -11,19 +11,16 @@ custom_rules:
 S3_DST_BUCKET: "nextstrain-data/files/ncov/open"
 S3_DST_COMPRESSION: "xz"
 S3_DST_ORIGINS: ["open"]
+upload:
+  - build-files
 
-# NOTE for shepherds -- there are commented out inputs here, you can
-# uncomment them to start the pipeline at that stage.
-# E.g. if you uncomment `filtered` then the pipeline
-# will start by downloading that file and proceeding straight to
-# subsampling
+# Note: we have a separate profile for aligning open sequences. This is triggered
+# as soon as new sequences are available. This workflow is thus intended to be
+# started from the filtered alignment.                        james, sept 2021
 inputs:
   - name: open
     metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.gz"
-    sequences: "s3://nextstrain-data/files/ncov/open/sequences.fasta.xz"
-    # aligned: "s3://nextstrain-data/files/ncov/open/aligned.fasta.xz"
-    # masked: "s3://nextstrain-data/files/ncov/open/masked.fasta.xz"
-    # filtered: "s3://nextstrain-data/files/ncov/open/filtered.fasta.xz"
+    filtered: "s3://nextstrain-data/files/ncov/open/filtered.fasta.xz"
 
 # Define locations for which builds should be created.
 # For each build we specify a subsampling scheme via an explicit key.
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 6db838cc6..4873c39e9 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -61,3 +61,12 @@ properties:
       type: string
       # A similar pattern is used in the workflow's wildcard constraints.
       pattern: "^[a-zA-Z0-9-]+$"
+
+  upload:
+    type: array
+    minItems: 1
+    items:
+      type: string
+      enum:
+        - preprocessing-files
+        - build-files
\ No newline at end of file
diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk
index 5130a7158..89c86b1b2 100644
--- a/workflow/snakemake_rules/common.smk
+++ b/workflow/snakemake_rules/common.smk
@@ -152,15 +152,16 @@ def _get_upload_inputs(wildcards):
     origin = config["S3_DST_ORIGINS"][0]
 
     # mapping of remote → local filenames
-    uploads = {
+    preprocessing_files = {
         f"aligned.fasta.xz":              f"results/aligned_{origin}.fasta.xz",              # from `rule align`
         f"masked.fasta.xz":               f"results/masked_{origin}.fasta.xz",               # from `rule mask`
         f"filtered.fasta.xz":             f"results/filtered_{origin}.fasta.xz",             # from `rule filter`
         f"mutation-summary.tsv.xz":       f"results/mutation_summary_{origin}.tsv.xz",       # from `rule mutation_summary`
     }
 
+    build_files = {}
     for build_name in config["builds"]:
-        uploads.update({
+        build_files.update({
             f"{build_name}/sequences.fasta.xz": f"results/{build_name}/{build_name}_subsampled_sequences.fasta.xz",   # from `rule combine_samples`
             f"{build_name}/metadata.tsv.xz":    f"results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",      # from `rule combine_samples`
             f"{build_name}/aligned.fasta.xz":   f"results/{build_name}/aligned.fasta.xz",                             # from `rule build_align`
@@ -170,4 +171,13 @@ def _get_upload_inputs(wildcards):
             f"{build_name}/{build_name}_root-sequence.json":    f"auspice/{config['auspice_json_prefix']}_{build_name}_root-sequence.json"
         })
 
-    return uploads
+    req_upload = config.get("upload", [])
+    if "preprocessing-files" in req_upload and "build-files" in req_upload:
+        return {**preprocessing_files, **build_files}
+    elif "preprocessing-files" in req_upload:
+        return preprocessing_files
+    elif "build-files" in req_upload:
+        return build_files
+    else:
+        raise Exception("The upload rule requires an 'upload' parameter in the config.")
+
diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk
index dad5f5ca0..7bba71312 100644
--- a/workflow/snakemake_rules/export_for_nextstrain.smk
+++ b/workflow/snakemake_rules/export_for_nextstrain.smk
@@ -146,6 +146,7 @@ from os import environ
 
 SLACK_TOKEN   = environ["SLACK_TOKEN"]   = config["slack_token"]   or ""
 SLACK_CHANNEL = environ["SLACK_CHANNEL"] = config["slack_channel"] or ""
+BUILD_DESCRIPTION = f"Build to upload {' & '.join(config.get('upload', ['nothing']))}"
 
 try:
     deploy_origin = (
@@ -197,7 +198,7 @@ rule upload:
             shell("./scripts/upload-to-s3 {local:q} s3://{params.s3_bucket:q}/{remote:q} | tee -a {log:q}")
 
 onstart:
-    slack_message = f"Build {deploy_origin} started."
+    slack_message = f"{BUILD_DESCRIPTION} {deploy_origin} started."
 
     if SLACK_TOKEN and SLACK_CHANNEL:
         shell(f"""
@@ -210,7 +211,7 @@ onstart:
         """)
 
 onerror:
-    slack_message = f"Build {deploy_origin} failed."
+    slack_message = f"{BUILD_DESCRIPTION} {deploy_origin} failed."
 
     if SLACK_TOKEN and SLACK_CHANNEL:
         shell(f"""