diff --git a/.env.example b/.env.example index 956931c18..0e6d51d58 100644 --- a/.env.example +++ b/.env.example @@ -29,9 +29,13 @@ SYNC_DESTINATION=123.456.789.123:~/nextclade # URL of Nextclade datasets server. See: https://github.com/neherlab/nextclade_data # Replace this with `http://localhost:27722` to use local data server instead -DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org +DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org/v3 # DATA_FULL_DOMAIN=http://localhost:27722 +# If enabled, Nextclade Web will first attempt to fetch datasets from the corresponding GitHub branch. If this attempt +# fails, it will use `DATA_FULL_DOMAIN` as usual. +DATA_TRY_GITHUB_BRANCH=0 + # Directory path (relative to the root of the project) from which local data server takes the data. # Useful for local testing on new datasets. See: https://github.com/neherlab/nextclade_data # It is recommended to keep the `nextclade_data` git repo in a sibling directory of `nextclade` git repo. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 71c9ea5cb..3ba13e0ce 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,2 +1 @@ -vers blank_issues_enabled: false diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml index 4121a10d3..bb8b1bc0f 100644 --- a/.github/workflows/cli.yml +++ b/.github/workflows/cli.yml @@ -57,17 +57,17 @@ jobs: - name: "Setup environment (release)" if: endsWith(github.ref, '/release-cli') run: | - echo "DATA_FULL_DOMAIN=https://data.clades.nextstrain.org" >> $GITHUB_ENV + echo "DATA_FULL_DOMAIN=https://data.clades.nextstrain.org/v3" >> $GITHUB_ENV - name: "Setup environment (staging)" if: endsWith(github.ref, '/staging-cli') run: | - echo "DATA_FULL_DOMAIN=https://data.staging.clades.nextstrain.org" >> $GITHUB_ENV + echo "DATA_FULL_DOMAIN=https://data.staging.clades.nextstrain.org/v3" >> $GITHUB_ENV - name: "Setup environment (master)" if: ${{ 
!endsWith(github.ref, '/staging-cli') && !endsWith(github.ref, '/release-cli') }} run: | - echo "DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org" >> $GITHUB_ENV + echo "DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org/v3" >> $GITHUB_ENV - name: "Checkout code" uses: actions/checkout@v3 @@ -106,7 +106,7 @@ jobs: run: | cp .env.example .env sed -i -e "s|OSXCROSS_URL=http://example.com/osxcross/osxcross.tar.xz|OSXCROSS_URL=${{ secrets.OSXCROSS_URL }}|g" .env - sed -i -e "s|DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org|DATA_FULL_DOMAIN=${DATA_FULL_DOMAIN}|g" .env + sed -i -e "s|DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org/v3|DATA_FULL_DOMAIN=${DATA_FULL_DOMAIN}|g" .env - name: "Login to Docker Hub" uses: docker/login-action@v2 @@ -170,7 +170,6 @@ jobs: run: | cp .env.example .env sed -i -e "s|OSXCROSS_URL=http://example.com/osxcross/osxcross.tar.xz|OSXCROSS_URL=${{ secrets.OSXCROSS_URL }}|g" .env - sed -i -e "s|DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org|DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org|g" .env - name: "Run unit tests" run: | @@ -217,7 +216,6 @@ jobs: run: | cp .env.example .env sed -i -e "s|OSXCROSS_URL=http://example.com/osxcross/osxcross.tar.xz|OSXCROSS_URL=${{ secrets.OSXCROSS_URL }}|g" .env - sed -i -e "s|DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org|DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org|g" .env - name: "Run lints" run: | diff --git a/.github/workflows/web.yml b/.github/workflows/web.yml index a61a80c57..062bdd39c 100644 --- a/.github/workflows/web.yml +++ b/.github/workflows/web.yml @@ -37,7 +37,7 @@ jobs: run: | echo "ENV_NAME=release" >> $GITHUB_ENV echo "FULL_DOMAIN=https://clades.nextstrain.org" >> $GITHUB_ENV - echo "DATA_FULL_DOMAIN=https://data.clades.nextstrain.org" >> $GITHUB_ENV + echo "DATA_FULL_DOMAIN=https://data.clades.nextstrain.org/v3" >> $GITHUB_ENV echo "PLAUSIBLE_IO_DOMAIN=clades.nextstrain.org" >> $GITHUB_ENV - name: 
"Setup environment (staging)" @@ -45,7 +45,7 @@ jobs: run: | echo "ENV_NAME=staging" >> $GITHUB_ENV echo "FULL_DOMAIN=https://staging.clades.nextstrain.org" >> $GITHUB_ENV - echo "DATA_FULL_DOMAIN=https://data.staging.clades.nextstrain.org" >> $GITHUB_ENV + echo "DATA_FULL_DOMAIN=https://data.staging.clades.nextstrain.org/v3" >> $GITHUB_ENV echo "PLAUSIBLE_IO_DOMAIN=staging.clades.nextstrain.org" >> $GITHUB_ENV - name: "Setup environment (master)" @@ -53,7 +53,7 @@ jobs: run: | echo "ENV_NAME=master" >> $GITHUB_ENV echo "FULL_DOMAIN=https://master.clades.nextstrain.org" >> $GITHUB_ENV - echo "DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org" >> $GITHUB_ENV + echo "DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org/v3" >> $GITHUB_ENV echo "PLAUSIBLE_IO_DOMAIN=master.clades.nextstrain.org" >> $GITHUB_ENV - name: "Checkout code" @@ -107,7 +107,7 @@ jobs: run: | cp .env.example .env sed -i -e "s|FULL_DOMAIN=autodetect|FULL_DOMAIN=${FULL_DOMAIN}|g" .env - sed -i -e "s|DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org|DATA_FULL_DOMAIN=${DATA_FULL_DOMAIN}|g" .env + sed -i -e "s|DATA_FULL_DOMAIN=https://data.master.clades.nextstrain.org/v3|DATA_FULL_DOMAIN=${DATA_FULL_DOMAIN}|g" .env - name: "Login to Docker Hub" uses: docker/login-action@v2 diff --git a/.readthedocs.yml b/.readthedocs.yml index e3c0f2ec2..3f6d5d02b 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,6 +1,11 @@ --- version: 2 +build: + os: "ubuntu-22.04" + tools: + python: "mambaforge-22.9" + conda: environment: docs/environment.yml diff --git a/Cargo.lock b/Cargo.lock index f21399925..482e7bd77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,16 +75,15 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.3.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = 
"b1f58811cfac344940f1a400b6e6231ce35171f614f26439e80f8c1465c5cc0c" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", "utf8parse", ] @@ -114,9 +113,9 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd" dependencies = [ "anstyle", "windows-sys", @@ -469,51 +468,42 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.10" +version = "4.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "384e169cc618c613d5e3ca6404dda77a8685a63e08660dcc64abaf7da7cb0c7a" +checksum = "6a13b88d2c62ff462f88e4a121f17a82c1af05693a2f192b5c38d14de73c19f6" dependencies = [ "clap_builder", "clap_derive", - "once_cell", -] - -[[package]] -name = "clap-verbosity-flag" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1eef05769009513df2eb1c3b4613e7fad873a14c600ff025b08f250f59fee7de" -dependencies = [ - "clap", - "log", ] [[package]] name = "clap_builder" -version = "4.3.10" +version = "4.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef137bbe35aab78bdb468ccfba75a5f4d8321ae011d34063770780545176af2d" +checksum = "2bb9faaa7c2ef94b2743a21f5a29e6f0010dff4caa69ac8e9d6cf8b6fa74da08" dependencies = [ "anstream", "anstyle", "clap_lex", "strsim", + "unicase", + "unicode-width", ] [[package]] name = "clap_complete" -version = "4.3.1" +version = "4.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6b5c519bab3ea61843a7923d074b04245624bb84a64a8c150f5deb014e388b" +checksum = "4110a1e6af615a9e6d0a36f805d5c99099f8bab9b8042f5bc1fa220a4a89e36f" dependencies = [ "clap", ] [[package]] name = "clap_complete_fig" -version = "4.3.1" +version = 
"4.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99fee1d30a51305a6c2ed3fc5709be3c8af626c9c958e04dd9ae94e27bcbce9f" +checksum = "9e9bae21b3f6eb417ad3054c8b1094aa0542116eba4979b1b271baefbfa6b965" dependencies = [ "clap", "clap_complete", @@ -521,9 +511,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.3.2" +version = "4.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873" dependencies = [ "heck", "proc-macro2", @@ -1712,7 +1702,7 @@ dependencies = [ [[package]] name = "nextclade" -version = "2.14.0" +version = "3.0.0-alpha.0" dependencies = [ "assert2", "atty", @@ -1722,7 +1712,6 @@ dependencies = [ "bzip2", "chrono", "clap", - "clap-verbosity-flag", "clap_complete", "clap_complete_fig", "color-eyre", @@ -1748,6 +1737,7 @@ dependencies = [ "num-traits", "num_cpus", "optfield", + "ordered-float", "owo-colors", "pretty_assertions", "rayon", @@ -1761,6 +1751,7 @@ dependencies = [ "serde_repr", "serde_stacker", "serde_yaml", + "strsim", "strum 0.25.0", "strum_macros 0.25.0", "tinytemplate", @@ -1775,7 +1766,7 @@ dependencies = [ [[package]] name = "nextclade-cli" -version = "2.14.0" +version = "3.0.0-alpha.0" dependencies = [ "assert2", "clap", @@ -1796,6 +1787,7 @@ dependencies = [ "log", "nextclade", "num_cpus", + "ordered-float", "owo-colors", "pretty_assertions", "rayon", @@ -1809,15 +1801,17 @@ dependencies = [ "serde_json", "strum 0.25.0", "strum_macros 0.25.0", + "tinytemplate", "url", "zip", ] [[package]] name = "nextclade-web" -version = "2.14.0" +version = "3.0.0-alpha.0" dependencies = [ "assert2", + "chrono", "console_error_panic_hook", "eyre", "getrandom", @@ -1955,11 +1949,14 @@ dependencies = [ [[package]] name = "ordered-float" -version = "3.7.0" +version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213" +checksum = "2a54938017eacd63036332b4ae5c8a49fc8c0c1d6d629893057e4f13609edd06" dependencies = [ "num-traits", + "rand", + "schemars", + "serde", ] [[package]] @@ -2175,6 +2172,7 @@ dependencies = [ "libc", "rand_chacha", "rand_core", + "serde", ] [[package]] @@ -2194,6 +2192,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", + "serde", ] [[package]] @@ -2514,6 +2513,9 @@ name = "semver" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +dependencies = [ + "serde", +] [[package]] name = "serde" @@ -3029,6 +3031,15 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +[[package]] +name = "unicase" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.13" diff --git a/docker-dev b/docker-dev index 43da950ea..fc8d69e62 100755 --- a/docker-dev +++ b/docker-dev @@ -545,10 +545,8 @@ ${NICE} docker run --rm \ if [ -n "${CROSS:-}" ] && [ -n "${RELEASE:-}" ] && { [ "${BUILD:-}" == 1 ] || [ "${RUN:-}" == 1 ]; }; then mkdir -p .out/ if [[ "${CROSS}" == *windows* ]]; then - cp "${BUILD_DIR}/${CROSS}/release/nextalign.exe" ".out/nextalign-${CROSS}.exe" cp "${BUILD_DIR}/${CROSS}/release/nextclade.exe" ".out/nextclade-${CROSS}.exe" else - cp "${BUILD_DIR}/${CROSS}/release/nextalign" ".out/nextalign-${CROSS}" cp "${BUILD_DIR}/${CROSS}/release/nextclade" ".out/nextclade-${CROSS}" fi fi diff --git a/docker/docker-prod-alpine.dockerfile 
b/docker/docker-prod-alpine.dockerfile index 9fa498867..3117a3531 100644 --- a/docker/docker-prod-alpine.dockerfile +++ b/docker/docker-prod-alpine.dockerfile @@ -1,8 +1,8 @@ FROM alpine:3 COPY .out/nextclade-x86_64-unknown-linux-musl /usr/bin/nextclade -COPY .out/nextalign-x86_64-unknown-linux-musl /usr/bin/nextalign RUN set -eux \ +&& ln -s /usr/bin/nextclade /usr/bin/nextalign \ && ln -s /usr/bin/nextclade /nextclade \ && ln -s /usr/bin/nextalign /nextalign diff --git a/docker/docker-prod-debian.dockerfile b/docker/docker-prod-debian.dockerfile index 9e17b5b3b..4ba48d45f 100644 --- a/docker/docker-prod-debian.dockerfile +++ b/docker/docker-prod-debian.dockerfile @@ -1,9 +1,9 @@ FROM debian:11 COPY .out/nextclade-x86_64-unknown-linux-gnu /usr/bin/nextclade -COPY .out/nextalign-x86_64-unknown-linux-gnu /usr/bin/nextalign RUN set -eux \ +&& ln -s /usr/bin/nextclade /usr/bin/nextalign \ && ln -s /usr/bin/nextclade /nextclade \ && ln -s /usr/bin/nextalign /nextalign \ && export DEBIAN_FRONTEND=noninteractive \ diff --git a/docker/docker-prod-scratch.dockerfile b/docker/docker-prod-scratch.dockerfile index 17b3f08d4..94ed85b6d 100644 --- a/docker/docker-prod-scratch.dockerfile +++ b/docker/docker-prod-scratch.dockerfile @@ -1,4 +1,6 @@ FROM scratch COPY .out/nextclade-x86_64-unknown-linux-musl /nextclade -COPY .out/nextalign-x86_64-unknown-linux-musl /nextalign + +# The scratch image has no shell or coreutils, so `RUN ln -s` cannot execute here; ship the nextclade binary under the legacy name instead. +COPY .out/nextclade-x86_64-unknown-linux-musl /nextalign diff --git a/docs/user/input-files.md b/docs/user/input-files.md index 48a666efa..9c1acb0a1 100644 --- a/docs/user/input-files.md +++ b/docs/user/input-files.md @@ -247,13 +247,13 @@ Nextclade Web (simple and advanced modes): accepted in "Sequences" drag & drop b Nextclade CLI and Nextalign CLI accept fasta inputs as one or multiple positional arguments. Accepts plain or compressed FASTA files. If a compressed fasta file is provided, it will be transparently decompressed. Supported compression formats: `gz`, `bz2`, `xz`, `zstd`. 
Decompressor is chosen based on file extension. If there's multiple input files, then different files can have different compression formats. If positional arguments provided, the plain fasta input is read from standard input (stdin). -Accepted formats: [FASTA](https://en.wikipedia.org/wiki/FASTA_format) or plain text (one sequence per line). +Accepted formats: [FASTA](https://en.wikipedia.org/wiki/FASTA_format) ## Reference (root) sequence Viral nucleotide sequence which serves as a reference for alignment and the analysis. Mutations are called relative to the reference sequence. It is expected to be the root of the [reference tree](#reference-tree). The best results are obtained when the reference sequence is a well-known consensus genome, of a very high quality, preferably complete and unambiguous (spans entire genome and has no ambiguous nucleotides). -Accepted formats: [FASTA](https://en.wikipedia.org/wiki/FASTA_format) or plain text. The file is expected to contain only 1 sequence. +Accepted formats: [FASTA](https://en.wikipedia.org/wiki/FASTA_format) file containing exactly 1 sequence. Nextclade Web (advanced mode): accepted in "Root sequence" drag & drop box. A remote URL is also accepted in `input-root-sequence` URL parameter. diff --git a/packages_rs/nextclade-cli/Cargo.toml b/packages_rs/nextclade-cli/Cargo.toml index 36e87ce42..b82b3e730 100644 --- a/packages_rs/nextclade-cli/Cargo.toml +++ b/packages_rs/nextclade-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "nextclade-cli" -version = "2.14.0" +version = "3.0.0-alpha.0" description = "Alignment, mutation calling, phylogenetic placement, clade assignment and quality control checks for viral genetic sequences. CLI module." 
repository = "https://github.com/nextstrain/nextclade" documentation = "https://docs.nextstrain.org/projects/nextclade/en/stable/" @@ -11,9 +11,9 @@ publish = false [dependencies] assert2 = "=0.3.11" -clap = { version = "=4.3.10", features = ["derive"] } -clap_complete = "=4.3.1" -clap_complete_fig = "=4.3.1" +clap = { version = "=4.4.2", features = ["derive", "color", "unicode", "unstable-styles"] } +clap_complete = "=4.4.1" +clap_complete_fig = "=4.4.0" color-eyre = "=0.6.2" comfy-table = "=7.0.1" crossbeam = "=0.8.2" @@ -28,17 +28,19 @@ lazy_static = "=1.4.0" log = "=0.4.19" nextclade = { path = "../nextclade" } num_cpus = "=1.16.0" +ordered-float = { version = "=3.9.1", features = ["rand", "serde", "schemars"] } owo-colors = "=3.5.0" pretty_assertions = "=1.3.0" rayon = "=1.7.0" regex = "=1.8.4" reqwest = { version = "=0.11.18", default-features = false, features = ["blocking", "deflate", "gzip", "brotli", "socks", "rustls-tls"] } schemars = { version = "=0.8.12", features = ["chrono", "either", "enumset", "indexmap1"] } -semver = "=1.0.17" +semver = { version = "=1.0.17", features = ["serde"] } serde = { version = "=1.0.164", features = ["derive"] } serde_json = { version = "=1.0.99", features = ["preserve_order", "indexmap", "unbounded_depth"] } strum = "=0.25.0" strum_macros = "=0.25" +tinytemplate = "=1.2.1" url = { version = "=2.4.0", features = ["serde"] } zip = { version = "=0.6.6", default-features = false, features = ["aes-crypto", "bzip2", "deflate", "time"] } diff --git a/packages_rs/nextclade-cli/src/bin/featuretree.rs b/packages_rs/nextclade-cli/src/bin/featuretree.rs deleted file mode 100644 index 27ca1c590..000000000 --- a/packages_rs/nextclade-cli/src/bin/featuretree.rs +++ /dev/null @@ -1,57 +0,0 @@ -use clap::{Parser, ValueHint}; -use ctor::ctor; -use eyre::Report; -use log::LevelFilter; -use nextclade::features::feature_tree::FeatureTree; -use nextclade::io::json::{json_stringify, json_write, JsonPretty}; -use nextclade::io::yaml::yaml_write; 
-use nextclade::utils::global_init::global_init; -use nextclade::utils::global_init::setup_logger; -use std::fmt::Debug; -use std::path::PathBuf; - -#[ctor] -fn init() { - global_init(); -} - -#[derive(Parser, Debug)] -#[clap(name = "featuremap")] -#[clap(author, version)] -#[clap(verbatim_doc_comment)] -pub struct FeaturemapArgs { - /// Path to input GFF3 file - #[clap(value_hint = ValueHint::FilePath)] - #[clap(hide_long_help = true, hide_short_help = true)] - pub input_feature_map: PathBuf, - - /// Path to output file - #[clap(long, short = 'o')] - #[clap(value_hint = ValueHint::DirPath)] - pub output: Option, - - /// Print output in JSON format - #[clap(long)] - pub json: bool, -} - -fn main() -> Result<(), Report> { - let args = FeaturemapArgs::parse(); - setup_logger(LevelFilter::Warn); - let feature_tree = FeatureTree::from_gff3_file(args.input_feature_map)?; - - if let Some(output) = args.output { - if output.ends_with("yaml") || output.ends_with("yml") { - yaml_write(output, &feature_tree)?; - } else { - json_write(output, &feature_tree, JsonPretty(true))?; - } - } - - if args.json { - println!("{}\n", json_stringify(&feature_tree, JsonPretty(true))?); - } else { - println!("{}", &feature_tree.to_pretty_string()?); - } - Ok(()) -} diff --git a/packages_rs/nextclade-cli/src/bin/genemap.rs b/packages_rs/nextclade-cli/src/bin/genemap.rs deleted file mode 100644 index 27a53e0c9..000000000 --- a/packages_rs/nextclade-cli/src/bin/genemap.rs +++ /dev/null @@ -1,58 +0,0 @@ -use clap::{Parser, ValueHint}; -use ctor::ctor; -use eyre::Report; -use log::LevelFilter; -use nextclade::gene::gene_map::GeneMap; -use nextclade::gene::gene_map_display::gene_map_to_table_string; -use nextclade::io::json::{json_stringify, json_write, JsonPretty}; -use nextclade::io::yaml::yaml_write; -use nextclade::utils::global_init::global_init; -use nextclade::utils::global_init::setup_logger; -use std::fmt::Debug; -use std::path::PathBuf; - -#[ctor] -fn init() { - global_init(); -} - 
-#[derive(Parser, Debug)] -#[clap(name = "genemap")] -#[clap(author, version)] -#[clap(verbatim_doc_comment)] -pub struct GenemapArgs { - #[clap(value_hint = ValueHint::FilePath)] - #[clap(hide_long_help = true, hide_short_help = true)] - pub input_gene_map: PathBuf, - - /// Path to output file - #[clap(long, short = 'o')] - #[clap(value_hint = ValueHint::DirPath)] - pub output: Option, - - /// Print output in JSON format - #[clap(long)] - pub json: bool, -} - -fn main() -> Result<(), Report> { - let args = GenemapArgs::parse(); - setup_logger(LevelFilter::Warn); - let gene_map = GeneMap::from_file(args.input_gene_map)?; - - if let Some(output) = args.output { - if output.to_string_lossy().ends_with("yaml") || output.to_string_lossy().ends_with("yml") { - yaml_write(output, &gene_map)?; - } else { - json_write(output, &gene_map, JsonPretty(true))?; - } - } - - if args.json { - println!("{}\n", json_stringify(&gene_map, JsonPretty(true))?); - } else { - println!("{}", gene_map_to_table_string(&gene_map)?); - } - - Ok(()) -} diff --git a/packages_rs/nextclade-cli/src/bin/nextalign.rs b/packages_rs/nextclade-cli/src/bin/nextalign.rs deleted file mode 100644 index 7f23d05a6..000000000 --- a/packages_rs/nextclade-cli/src/bin/nextalign.rs +++ /dev/null @@ -1,13 +0,0 @@ -use ctor::ctor; -use eyre::Report; -use nextclade::utils::global_init::global_init; -use nextclade_cli::cli::nextalign_cli::nextalign_handle_cli_args; - -#[ctor] -fn init() { - global_init(); -} - -fn main() -> Result<(), Report> { - nextalign_handle_cli_args() -} diff --git a/packages_rs/nextclade-cli/src/cli/mod.rs b/packages_rs/nextclade-cli/src/cli/mod.rs index 99ee2c0ab..21edd855b 100644 --- a/packages_rs/nextclade-cli/src/cli/mod.rs +++ b/packages_rs/nextclade-cli/src/cli/mod.rs @@ -1,9 +1,8 @@ -pub mod nextalign_cli; -pub mod nextalign_loop; -pub mod nextalign_ordered_writer; pub mod nextclade_cli; pub mod nextclade_dataset_get; pub mod nextclade_dataset_list; pub mod nextclade_loop; pub mod 
nextclade_ordered_writer; +pub mod nextclade_read_annotation; +pub mod nextclade_seq_sort; pub mod verbosity; diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs b/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs deleted file mode 100644 index 5a16a16c1..000000000 --- a/packages_rs/nextclade-cli/src/cli/nextalign_cli.rs +++ /dev/null @@ -1,464 +0,0 @@ -use crate::cli::nextalign_loop::nextalign_run; -use crate::cli::nextclade_cli::{check_shells, SHELLS}; -use crate::cli::verbosity::{Verbosity, WarnLevel}; -use clap::{CommandFactory, Parser, Subcommand, ValueEnum, ValueHint}; -use clap_complete::{generate, Generator, Shell}; -use clap_complete_fig::Fig; -use eyre::{eyre, ContextCompat, Report, WrapErr}; -use itertools::Itertools; -use nextclade::align::params::AlignPairwiseParamsOptional; -use nextclade::io::fs::add_extension; -use nextclade::make_error; -use nextclade::utils::global_init::setup_logger; -use std::fmt::Debug; -use std::io; -use std::path::PathBuf; -use strum::IntoEnumIterator; -use strum_macros::EnumIter; - -#[derive(Parser, Debug)] -#[clap(name = "nextalign")] -#[clap(author, version)] -#[clap(verbatim_doc_comment)] -/// Viral sequence alignment and translation. -/// -/// Nextalign is a part of Nextstrain: https://nextstrain.org -/// -/// Documentation: https://docs.nextstrain.org/projects/nextclade -/// Nextclade Web: https://clades.nextstrain.org -/// Publication: https://doi.org/10.21105/joss.03773 -/// -/// Please read short help with `nextalign -h` and extended help with `nextalign --help`. Each subcommand has its own help, for example: `nextclade run --help`. -pub struct NextalignArgs { - #[clap(subcommand)] - pub command: NextalignCommands, - - /// Make output more quiet or more verbose - #[clap(flatten, next_help_heading = " Verbosity")] - pub verbosity: Verbosity, -} - -#[derive(Subcommand, Debug)] -#[clap(verbatim_doc_comment)] -pub enum NextalignCommands { - /// Generate shell completions. 
- /// - /// This will print the completions file contents to the console. Refer to your shell's documentation on how to install the completions. - /// - /// Example for Ubuntu Linux: - /// - /// nextalign completions bash > ~/.local/share/bash-completion/nextalign - /// - Completions { - /// Name of the shell to generate appropriate completions - #[clap(value_name = "SHELL", default_value_t = String::from("bash"), value_parser = check_shells)] - shell: String, - }, - - /// Run alignment and translation. - /// - /// For short help type: `nextclade -h`, for extended help type: `nextclade --help`. Each subcommand has its own help, for example: `nextclade run --help`. - Run(Box), -} - -#[derive(Copy, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, EnumIter)] -pub enum NextalignOutputSelection { - All, - Fasta, - Translations, - Insertions, - Errors, -} - -#[derive(Parser, Debug)] -pub struct NextalignRunInputArgs { - /// Path to one or multiple FASTA files with input sequences - /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". If no files provided, the plain fasta input is read from standard input (stdin). - /// - /// See: https://en.wikipedia.org/wiki/FASTA_format - #[clap(value_hint = ValueHint::FilePath)] - #[clap(display_order = 1)] - pub input_fastas: Vec, - - /// REMOVED. Use positional arguments instead. - /// - /// Example: nextalign run -D dataset/ -O out/ seq1.fasta seq2.fasta - #[clap(long, short = 'i', visible_alias("sequences"))] - #[clap(value_hint = ValueHint::FilePath)] - #[clap(hide_long_help = true, hide_short_help = true)] - pub input_fasta: Option, - - /// Path to a FASTA file containing reference sequence. This file should contain exactly 1 sequence. - /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". Use "-" to read uncompressed data from standard input (stdin). 
- #[clap(long, short = 'r', visible_alias("reference"))] - #[clap(value_hint = ValueHint::FilePath)] - pub input_ref: PathBuf, - - /// Path to a .gff file containing the gene map (genome annotation). - /// - /// Gene map (sometimes also called 'genome annotation') is used to find coding regions. If not supplied, coding regions will - /// not be translated, amino acid sequences will not be output, and nucleotide sequence - /// alignment will not be informed by codon boundaries - /// - /// List of genes can be restricted using `--genes` flag. Otherwise all genes found in the gene map will be used. - /// - /// Learn more about Generic Feature Format Version 3 (GFF3): - /// https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md - /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". Use "-" to read uncompressed data from standard input (stdin). - #[clap(long, short = 'm', alias = "genemap")] - #[clap(value_hint = ValueHint::FilePath)] - pub input_gene_map: Option, - - /// Comma-separated list of names of genes to use. - /// - /// This defines which peptides will be written into outputs, and which genes will be taken into account during - /// codon-aware alignment. Must only contain gene names present in the gene map. If - /// this flag is not supplied or its value is an empty string, then all genes found in the gene map will be used. - /// - /// Requires `--input-gene-map` to be specified. - #[clap( - long, - short = 'g', - num_args=1.., - use_value_delimiter = true - )] - #[clap(value_hint = ValueHint::FilePath)] - pub genes: Option>, -} - -#[derive(Parser, Debug)] -pub struct NextalignRunOutputArgs { - /// REMOVED. Use `--output-all` instead - #[clap(long)] - #[clap(value_hint = ValueHint::DirPath)] - #[clap(hide_long_help = true, hide_short_help = true)] - pub output_dir: Option, - - /// Produce all of the output files into this directory, using default basename and predefined suffixes and extensions. 
This is equivalent to specifying each of the individual `--output-*` flags. Convenient when you want to receive all or most of output files into the same directory and don't care about their filenames. - /// - /// Output files can be optionally included or excluded using `--output-selection` flag. - /// The base filename can be set using `--output-basename` flag. - /// - /// If both the `--output-all` and individual `--output-*` flags are provided, each individual flag overrides the corresponding default output path. - /// - /// At least one of the output flags is required: `--output-all`, `--output-fasta`, `--output-translations`, `--output-insertions`, `--output-errors` - /// - /// If the required directory tree does not exist, it will be created. - #[clap(long, short = 'O')] - #[clap(value_hint = ValueHint::DirPath)] - pub output_all: Option, - - /// Set the base filename to use for output files. - /// - /// By default the base filename is extracted from the input sequences file (provided with `--input-fasta`). - /// - /// Only valid together with `--output-all` flag. - #[clap(long, short = 'n')] - #[clap(requires = "output_all")] - pub output_basename: Option, - - /// Restricts outputs for `--output-all` flag. - /// - /// Should contain a comma-separated list of names of output files to produce. - /// - /// If 'all' is present in the list, then all other entries are ignored and all outputs are produced. - /// - /// Only valid together with `--output-all` flag. - #[clap( - long, - short = 's', - num_args=1.., - use_value_delimiter = true - )] - #[clap(requires = "output_all")] - #[clap(value_enum)] - pub output_selection: Vec, - - /// Path to output FASTA file with aligned sequences. - /// - /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. - /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. 
Use "-" to write the uncompressed to standard output (stdout). - /// - /// If the required directory tree does not exist, it will be created. - #[clap(long, short = 'o')] - #[clap(value_hint = ValueHint::AnyPath)] - pub output_fasta: Option, - - /// Template string for path to output fasta files containing translated and aligned peptides. A separate file will be generated for every gene. - /// The string should contain template variable `{gene}`, where the gene name will be substituted. - /// Make sure you properly quote and/or escape the curly braces, so that your shell, programming language or pipeline manager does not attempt to substitute the variables. - /// - /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. - /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). - /// - /// If the required directory tree does not exist, it will be created. - /// - /// Example for bash shell: - /// - /// --output-translations='output_dir/gene_{gene}.translation.fasta' - #[clap(long, short = 'P')] - #[clap(value_hint = ValueHint::AnyPath)] - pub output_translations: Option, - - /// Path to output CSV file that contain insertions stripped from the reference alignment. - /// - /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. - /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). - /// - /// If the required directory tree does not exist, it will be created. 
- #[clap(long, short = 'I')] - #[clap(value_hint = ValueHint::AnyPath)] - pub output_insertions: Option, - - /// Path to output CSV file containing errors and warnings occurred during processing - /// - /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. - /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). - /// - /// If the required directory tree does not exist, it will be created. - #[clap(long, short = 'e')] - #[clap(value_hint = ValueHint::AnyPath)] - pub output_errors: Option, - - /// Whether to include aligned reference nucleotide sequence into output nucleotide sequence FASTA file and reference peptides into output peptide FASTA files. - #[clap(long)] - pub include_reference: bool, - - /// Emit output sequences in-order. - /// - /// With this flag the program will wait for results from the previous sequences to be written to the output files before writing the results of the next sequences, preserving the same order as in the input file. Due to variable sequence processing times, this might introduce unnecessary waiting times, but ensures that the resulting sequences are written in the same order as they occur in the inputs (except for sequences which have errors). - /// By default, without this flag, processing might happen out of order, which is faster, due to the elimination of waiting, but might also lead to results written out of order - the order of results is not specified and depends on thread scheduling and processing times of individual sequences. - /// - /// This option is only relevant when `--jobs` is greater than 1 or is omitted. - /// - /// Note: the sequences which trigger errors during processing will be omitted from outputs, regardless of this flag. 
- #[clap(long)] - pub in_order: bool, - - /// Replace unknown nucleotide characters with 'N' - /// - /// By default, the sequences containing unknown nucleotide nucleotide characters are skipped with a warning - they - /// are not aligned and not included into results. If this flag is provided, then before the alignment, - /// all unknown characters are replaced with 'N'. This replacement allows to align these sequences. - /// - /// The following characters are considered known: '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y' - #[clap(long)] - pub replace_unknown: bool, -} - -#[derive(Parser, Debug)] -pub struct NextalignRunOtherArgs { - /// Number of processing jobs. If not specified, all available CPU threads will be used. - #[clap(global = false, long, short = 'j', default_value_t = num_cpus::get())] - pub jobs: usize, -} - -#[derive(Parser, Debug)] -pub struct NextalignRunArgs { - #[clap(flatten, next_help_heading = " Inputs")] - pub inputs: NextalignRunInputArgs, - - #[clap(flatten, next_help_heading = " Outputs")] - pub outputs: NextalignRunOutputArgs, - - #[clap(flatten, next_help_heading = " Alignment parameters")] - pub alignment_params: AlignPairwiseParamsOptional, - - #[clap(flatten, next_help_heading = " Other")] - pub other: NextalignRunOtherArgs, -} - -fn generate_completions(shell: &str) -> Result<(), Report> { - let mut command = NextalignArgs::command(); - - if shell.to_lowercase() == "fig" { - generate(Fig, &mut command, "nextalign", &mut io::stdout()); - return Ok(()); - } - - let generator = Shell::from_str(&shell.to_lowercase(), true) - .map_err(|err| eyre!("{}: Possible values: {}", err, SHELLS.join(", ")))?; - - let bin_name = command.get_name().to_owned(); - - generate(generator, &mut command, bin_name, &mut io::stdout()); - - Ok(()) -} - -/// Get output filenames provided by user or, if not provided, create filenames based on input fasta -pub fn nextalign_get_output_filenames(run_args: &mut NextalignRunArgs) 
-> Result<(), Report> { - let NextalignRunArgs { - inputs: - NextalignRunInputArgs { - input_fastas, - input_ref, - input_gene_map, - genes, - .. - }, - outputs: - NextalignRunOutputArgs { - output_all, - output_basename, - output_selection, - output_fasta, - output_translations, - output_insertions, - output_errors, - include_reference, - in_order, - .. - }, - other: NextalignRunOtherArgs { jobs }, - alignment_params, - } = run_args; - - // If `--output-all` is provided, then we need to deduce default output filenames, - // while taking care to preserve values of any individual `--output-*` flags, - // as well as to honor restrictions put by the `--output-selection` flag, if provided. - if let Some(output_all) = output_all { - let output_basename = output_basename.clone().unwrap_or_else(|| "nextalign".to_owned()); - - let default_output_file_path = output_all.join(&output_basename); - - // If `--output-selection` is empty or contains `all`, then fill it with all possible variants - if output_selection.is_empty() || output_selection.contains(&NextalignOutputSelection::All) { - *output_selection = NextalignOutputSelection::iter().collect_vec(); - } - - // We use `Option::get_or_insert()` mutable method here in order - // to set default output filenames only if they are not provided. 
- - if output_selection.contains(&NextalignOutputSelection::Fasta) { - output_fasta.get_or_insert(add_extension(&default_output_file_path, "aligned.fasta")); - } - - if output_selection.contains(&NextalignOutputSelection::Insertions) { - let output_insertions = - output_insertions.get_or_insert(add_extension(&default_output_file_path, "insertions.csv")); - } - - if output_selection.contains(&NextalignOutputSelection::Errors) { - let output_errors = output_errors.get_or_insert(add_extension(&default_output_file_path, "errors.csv")); - } - - if output_selection.contains(&NextalignOutputSelection::Translations) { - let output_translations = { - let output_translations_path = - default_output_file_path.with_file_name(format!("{output_basename}_gene_{{gene}}")); - let output_translations_path = add_extension(output_translations_path, "translation.fasta"); - - let output_translations_template = output_translations_path - .to_str() - .wrap_err_with(|| format!("When converting path to string: '{output_translations_path:?}'"))? - .to_owned(); - - output_translations.get_or_insert(output_translations_template) - }; - } - } - - if let Some(output_translations) = output_translations { - if !output_translations.contains("{gene}") { - return make_error!( - r#" -Expected `--output-translations` argument to contain a template string containing template variable {{gene}} (with curly braces), but received: - - {output_translations} - -Make sure the variable is not substituted by your shell, programming language or workflow manager. Apply proper escaping as needed. -Example for bash shell: - - --output-translations='output_dir/gene_{{gene}}.translation.fasta' - - "# - ); - } - } - - let all_outputs_are_missing = [output_all, output_fasta, output_insertions, output_errors] - .iter() - .all(|o| o.is_none()) - && output_translations.is_none(); - - if all_outputs_are_missing { - return make_error!( - r#"No output flags provided. 
- -At least one of the following flags is required: - --output-all - --output-fasta - --output-translations - --output-insertions - --output-errors"# - ); - } - - Ok(()) -} - -const ERROR_MSG_INPUT_FASTA_REMOVED: &str = r#"The argument `--input-fasta` (alias: `--sequences`, `-i`) is removed in favor of positional arguments. - -Try: - - nextalign run -r ref.fasta -m genemap.gff -O out/ seq1.fasta seq2.fasta - - ^ ^ - one or multiple positional arguments - with paths to input fasta files - - -When positional arguments are not provided, nextalign will read input fasta from standard input. - -For more information, type - - nextalign run --help"#; - -const ERROR_MSG_OUTPUT_DIR_REMOVED: &str = r#"The argument `--output-dir` is removed in favor of `--output-all`. - -When provided, `--output-all` allows to write all possible outputs into a directory. - -The defaut base name of the files can be overriden with `--output-basename` argument. - -The set of output files can be restricted with `--output-selection` argument. 
- -For more information, type: - - nextalign run --help"#; - -pub fn nextalign_check_removed_args(run_args: &mut NextalignRunArgs) -> Result<(), Report> { - if run_args.inputs.input_fasta.is_some() { - return make_error!("{ERROR_MSG_INPUT_FASTA_REMOVED}"); - } - - if run_args.outputs.output_dir.is_some() { - return make_error!("{ERROR_MSG_OUTPUT_DIR_REMOVED}"); - } - - Ok(()) -} - -pub fn nextalign_handle_cli_args() -> Result<(), Report> { - let args = NextalignArgs::parse(); - - setup_logger(args.verbosity.get_filter_level()); - - match args.command { - NextalignCommands::Completions { shell } => { - generate_completions(&shell).wrap_err_with(|| format!("When generating completions for shell '{shell}'")) - } - NextalignCommands::Run(mut run_args) => { - nextalign_check_removed_args(&mut run_args)?; - nextalign_get_output_filenames(&mut run_args).wrap_err("When deducing output filenames")?; - nextalign_run(*run_args) - } - } -} diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs b/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs deleted file mode 100644 index 2e8c7b95c..000000000 --- a/packages_rs/nextclade-cli/src/cli/nextalign_loop.rs +++ /dev/null @@ -1,183 +0,0 @@ -use crate::cli::nextalign_cli::{ - NextalignRunArgs, NextalignRunInputArgs, NextalignRunOtherArgs, NextalignRunOutputArgs, -}; -use crate::cli::nextalign_ordered_writer::NextalignOrderedWriter; -use eyre::{Report, WrapErr}; -use log::info; -use nextclade::align::gap_open::{get_gap_open_close_scores_codon_aware, get_gap_open_close_scores_flat}; -use nextclade::align::params::AlignPairwiseParams; -use nextclade::align::seed_match2::CodonSpacedIndex; -use nextclade::alphabet::nuc::{to_nuc_seq, to_nuc_seq_replacing}; -use nextclade::gene::gene_map::{filter_gene_map, GeneMap}; -use nextclade::gene::gene_map_display::gene_map_to_table_string; -use nextclade::io::fasta::{read_one_fasta, FastaReader, FastaRecord}; -use nextclade::run::nextalign_run_one::nextalign_run_one; -use 
nextclade::translate::translate_genes_ref::translate_genes_ref; -use nextclade::types::outputs::NextalignOutputs; - -pub struct NextalignRecord { - pub index: usize, - pub seq_name: String, - pub outputs_or_err: Result, -} - -pub fn nextalign_run(run_args: NextalignRunArgs) -> Result<(), Report> { - info!("Command-line arguments:\n{run_args:#?}"); - - let NextalignRunArgs { - inputs: - NextalignRunInputArgs { - input_fastas, - input_ref, - input_gene_map, - genes, - .. - }, - outputs: - NextalignRunOutputArgs { - output_all, - output_basename, - output_selection, - output_fasta, - output_translations, - output_insertions, - output_errors, - include_reference, - replace_unknown, - in_order, - .. - }, - other: NextalignRunOtherArgs { jobs }, - alignment_params: alignment_params_from_cli, - } = run_args; - - let mut alignment_params = AlignPairwiseParams::default(); - - // Merge alignment params coming from CLI arguments - alignment_params.merge_opt(alignment_params_from_cli); - - let ref_record = &read_one_fasta(input_ref)?; - let ref_seq = &to_nuc_seq(&ref_record.seq).wrap_err("When reading reference sequence")?; - let seed_index = &CodonSpacedIndex::from_sequence(ref_seq); - - let gene_map = match input_gene_map { - Some(input_gene_map) => { - let gene_map = GeneMap::from_file(input_gene_map)?; - filter_gene_map(Some(gene_map), &genes)? 
- } - None => GeneMap::new(), - }; - - info!("Gene map:\n{}", gene_map_to_table_string(&gene_map)?); - - let gap_open_close_nuc = &get_gap_open_close_scores_codon_aware(ref_seq, &gene_map, &alignment_params); - let gap_open_close_aa = &get_gap_open_close_scores_flat(ref_seq, &alignment_params); - - let ref_peptides = &translate_genes_ref(ref_seq, &gene_map, &alignment_params)?; - - std::thread::scope(|s| { - const CHANNEL_SIZE: usize = 128; - let (fasta_sender, fasta_receiver) = crossbeam_channel::bounded::(CHANNEL_SIZE); - let (result_sender, result_receiver) = crossbeam_channel::bounded::(CHANNEL_SIZE); - - s.spawn(|| { - let mut reader = FastaReader::from_paths(&input_fastas).unwrap(); - loop { - let mut record = FastaRecord::default(); - reader.read(&mut record).unwrap(); - if record.is_empty() { - break; - } - fasta_sender - .send(record) - .wrap_err("When sending a FastaRecord") - .unwrap(); - } - drop(fasta_sender); - }); - - let gene_map = &gene_map; - for _ in 0..jobs { - let fasta_receiver = fasta_receiver.clone(); - let result_sender = result_sender.clone(); - let gap_open_close_nuc = &gap_open_close_nuc; - let gap_open_close_aa = &gap_open_close_aa; - let alignment_params = &alignment_params; - - s.spawn(move || { - let result_sender = result_sender.clone(); - - for FastaRecord { seq_name, seq, index } in &fasta_receiver { - info!("Processing sequence '{seq_name}'"); - - let outputs_or_err = if replace_unknown { - Ok(to_nuc_seq_replacing(&seq)) - } else { - to_nuc_seq(&seq) - } - .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'")) - .and_then(|qry_seq| { - nextalign_run_one( - index, - &seq_name, - &qry_seq, - ref_seq, - seed_index, - ref_peptides, - gene_map, - gap_open_close_nuc, - gap_open_close_aa, - alignment_params, - ) - }); - - let record = NextalignRecord { - index, - seq_name, - outputs_or_err, - }; - - // Important: **all** records should be sent into this channel, without skipping. 
- // In in-order mode, writer that receives from this channel expects a contiguous stream of indices. Gaps in - // the indices will cause writer to stall waiting for the missing index and the buffering queue to grow. Any - // filtering of records should be done in the writer, instead of here. - result_sender - .send(record) - .wrap_err("When sending NextalignRecord") - .unwrap(); - } - - drop(result_sender); - }); - } - - s.spawn(move || { - let mut output_writer = NextalignOrderedWriter::new( - gene_map, - &output_fasta, - &output_translations, - &output_insertions, - &output_errors, - in_order, - ) - .wrap_err("When creating output writer") - .unwrap(); - - if include_reference { - output_writer - .write_ref(ref_record, ref_peptides) - .wrap_err("When writing output record for ref sequence") - .unwrap(); - } - - for record in result_receiver { - output_writer - .write_record(record) - .wrap_err("When writing output record") - .unwrap(); - } - }); - }); - - Ok(()) -} diff --git a/packages_rs/nextclade-cli/src/cli/nextalign_ordered_writer.rs b/packages_rs/nextclade-cli/src/cli/nextalign_ordered_writer.rs deleted file mode 100644 index 9498ffe00..000000000 --- a/packages_rs/nextclade-cli/src/cli/nextalign_ordered_writer.rs +++ /dev/null @@ -1,184 +0,0 @@ -use crate::cli::nextalign_loop::NextalignRecord; -use eyre::{Report, WrapErr}; -use log::{info, warn}; -use nextclade::alphabet::nuc::from_nuc_seq; -use nextclade::gene::gene_map::GeneMap; -use nextclade::io::errors_csv::ErrorsCsvWriter; -use nextclade::io::fasta::{FastaPeptideWriter, FastaRecord, FastaWriter}; -use nextclade::io::insertions_csv::InsertionsCsvWriter; -use nextclade::translate::translate_genes::Translation; -use nextclade::types::outputs::NextalignOutputs; -use nextclade::utils::error::report_to_string; -use nextclade::utils::option::OptionMapRefFallible; -use std::collections::HashMap; -use std::path::PathBuf; - -/// Writes output files, potentially preserving the initial order of records (same as 
in the inputs) -pub struct NextalignOrderedWriter<'a> { - fasta_writer: Option, - fasta_peptide_writer: Option, - insertions_csv_writer: Option, - errors_csv_writer: Option>, - expected_index: usize, - queue: HashMap, - in_order: bool, -} - -impl<'a> NextalignOrderedWriter<'a> { - pub fn new( - gene_map: &'a GeneMap, - output_fasta: &Option, - output_translations: &Option, - output_insertions: &Option, - output_errors: &Option, - in_order: bool, - ) -> Result { - let fasta_writer = output_fasta.map_ref_fallible(FastaWriter::from_path)?; - - let fasta_peptide_writer = output_translations - .map_ref_fallible(|output_translations| FastaPeptideWriter::new(gene_map, output_translations))?; - - let insertions_csv_writer = output_insertions.map_ref_fallible(InsertionsCsvWriter::new)?; - - let errors_csv_writer = - output_errors.map_ref_fallible(|output_errors| ErrorsCsvWriter::new(gene_map, output_errors))?; - - Ok(Self { - fasta_writer, - fasta_peptide_writer, - insertions_csv_writer, - errors_csv_writer, - expected_index: 0, - queue: HashMap::::new(), - in_order, - }) - } - - pub fn write_ref(&mut self, ref_record: &FastaRecord, ref_translation: &Translation) -> Result<(), Report> { - let FastaRecord { seq_name, seq, .. } = &ref_record; - - if let Some(fasta_writer) = &mut self.fasta_writer { - fasta_writer.write(seq_name, seq, false)?; - } - - ref_translation.cdses().try_for_each(|cds_tr| { - if let Some(fasta_peptide_writer) = &mut self.fasta_peptide_writer { - fasta_peptide_writer.write(seq_name, cds_tr)?; - } - Result::<(), Report>::Ok(()) - })?; - - Ok(()) - } - - /// Writes output record into output files - fn write_impl(&mut self, record: &NextalignRecord) -> Result<(), Report> { - let NextalignRecord { - index, - seq_name, - outputs_or_err, - } = record; - - match outputs_or_err { - Ok(output) => { - let NextalignOutputs { - stripped, - alignment, - translation, - aa_insertions, - warnings, - missing_genes, - is_reverse_complement, - .. 
- } = output; - - if let Some(fasta_writer) = &mut self.fasta_writer { - fasta_writer.write(seq_name, &from_nuc_seq(&stripped.qry_seq), *is_reverse_complement)?; - } - - if let Some(fasta_peptide_writer) = &mut self.fasta_peptide_writer { - for translation in translation.cdses() { - fasta_peptide_writer.write(seq_name, translation)?; - } - } - - if let Some(insertions_csv_writer) = &mut self.insertions_csv_writer { - insertions_csv_writer.write(seq_name, &stripped.insertions, aa_insertions)?; - } - - for warning in warnings { - info!("In sequence #{index} '{seq_name}': {}", warning.warning); - } - - if let Some(errors_csv_writer) = &mut self.errors_csv_writer { - errors_csv_writer.write_aa_errors(seq_name, warnings, missing_genes)?; - } - } - Err(report) => { - let cause = report_to_string(report); - let message = format!( - "In sequence #{index} '{seq_name}': {cause}. Note that this sequence will not be included in the results." - ); - warn!("{message}"); - if let Some(insertions_csv_writer) = &mut self.insertions_csv_writer { - insertions_csv_writer.write(seq_name, &[], &[])?; - } - if let Some(errors_csv_writer) = &mut self.errors_csv_writer { - errors_csv_writer.write_nuc_error(seq_name, &message)?; - } - } - } - - Ok(()) - } - - /// In in-order mode, writes all queued records with indices subsequent to the next expected index. - /// On out-of-order mode, does nothing - the queue is always empty. - fn write_queued_records(&mut self) -> Result<(), Report> { - while let Some(record) = self.queue.remove(&self.expected_index) { - self.write_impl(&record)?; - self.expected_index += 1; - } - Ok(()) - } - - /// Writes a record. - /// - /// In in-order mode, if one or more of the preceding records has not been written yet (according to the record index - /// derived from order of records in the input files) then the current record is queued to be written at a later time. - /// This ensures that the records in output files are in the same order as in the input files. 
- /// - /// In out-of-order mode, records are written as they come from worker threads. In this case the order in output files - /// is not defined (due to differences in processing times between items, and thread scheduling between runs) - pub fn write_record(&mut self, record: NextalignRecord) -> Result<(), Report> { - if !self.in_order { - // Out-of-order mode: write immediately - self.write_impl(&record)?; - } else { - // In-order mode: check if the record has next expected index - if record.index == self.expected_index { - // If the record has next expected index, write it immediately - self.write_impl(&record)?; - self.expected_index += 1; - } else { - // If the record has an unexpected index, queue it to write later - self.queue.insert(record.index, record); - } - - // Periodically try to write the queued records - self.write_queued_records()?; - } - Ok(()) - } - - /// Finalizes output by writing all queued records - pub fn finish(&mut self) -> Result<(), Report> { - self.write_queued_records() - } -} - -impl<'a> Drop for NextalignOrderedWriter<'a> { - fn drop(&mut self) { - self.finish().wrap_err("When finalizing output writer").unwrap(); - } -} diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs b/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs index d63a96a03..c619138f2 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs @@ -1,17 +1,20 @@ use crate::cli::nextclade_dataset_get::nextclade_dataset_get; use crate::cli::nextclade_dataset_list::nextclade_dataset_list; use crate::cli::nextclade_loop::nextclade_run; +use crate::cli::nextclade_read_annotation::nextclade_read_annotation; +use crate::cli::nextclade_seq_sort::nextclade_seq_sort; use crate::cli::verbosity::{Verbosity, WarnLevel}; use crate::io::http_client::ProxyConfig; +use clap::builder::styling; use clap::{ArgGroup, CommandFactory, Parser, Subcommand, ValueEnum, ValueHint}; use clap_complete::{generate, Generator, 
Shell}; use clap_complete_fig::Fig; use eyre::{eyre, ContextCompat, Report, WrapErr}; use itertools::Itertools; use lazy_static::lazy_static; -use nextclade::align::params::AlignPairwiseParamsOptional; use nextclade::io::fs::add_extension; -use nextclade::tree::params::TreeBuilderParamsOptional; +use nextclade::run::params::NextcladeInputParamsOptional; +use nextclade::sort::params::NextcladeSeqSortParams; use nextclade::utils::global_init::setup_logger; use nextclade::{getenv, make_error}; use std::fmt::Debug; @@ -28,17 +31,19 @@ lazy_static! { pub static ref SHELLS: Vec<&'static str> = ["bash", "elvish", "fish", "fig", "powershell", "zsh"].to_vec(); } -pub fn check_shells(value: &str) -> Result { - SHELLS - .contains(&value) - .then_some(value.to_owned()) - .ok_or_else(|| eyre!("Unknown shell: '{value}'. Possible values: {}", SHELLS.join(", "))) +fn styles() -> styling::Styles { + styling::Styles::styled() + .header(styling::AnsiColor::Green.on_default() | styling::Effects::BOLD) + .usage(styling::AnsiColor::Green.on_default() | styling::Effects::BOLD) + .literal(styling::AnsiColor::Blue.on_default() | styling::Effects::BOLD) + .placeholder(styling::AnsiColor::Cyan.on_default()) } #[derive(Parser, Debug)] #[clap(name = "nextclade")] #[clap(author, version)] #[clap(verbatim_doc_comment)] +#[clap(styles = styles())] /// Viral genome alignment, mutation calling, clade assignment, quality checks and phylogenetic placement. 
/// /// Nextclade is a part of Nextstrain: https://nextstrain.org @@ -53,7 +58,7 @@ pub struct NextcladeArgs { pub command: NextcladeCommands, /// Make output more quiet or more verbose - #[clap(flatten, next_help_heading = " Verbosity")] + #[clap(flatten, next_help_heading = "Verbosity")] pub verbosity: Verbosity, } @@ -70,19 +75,29 @@ pub enum NextcladeCommands { /// Completions { /// Name of the shell to generate appropriate completions - #[clap(value_name = "SHELL", default_value_t = String::from("bash"), value_parser = check_shells)] + #[clap(value_name = "SHELL", default_value_t = String::from("bash"), value_parser = SHELLS.clone())] shell: String, }, - /// Run alignment, mutation calling, clade assignment, quality checks and phylogenetic placement + /// Run sequence analysis: alignment, mutation calling, clade assignment, quality checks and phylogenetic placement /// /// For short help type: `nextclade -h`, for extended help type: `nextclade --help`. Each subcommand has its own help, for example: `nextclade run --help`. Run(Box), - /// List and download available Nextclade datasets + /// List and download available Nextclade datasets (pathogens) /// - /// For short help type: `nextclade -h`, for extended help type: `nextclade --help`. Each subcommand has its own help, for example: `nextclade run --help`. + /// For short help type: `nextclade -h`, for extended help type: `nextclade --help`. Each subcommand has its own help, for example: `nextclade dataset --help`. Dataset(Box), + + /// Sort sequences according to the inferred Nextclade dataset (pathogen) + /// + /// For short help type: `nextclade -h`, for extended help type: `nextclade --help`. Each subcommand has its own help, for example: `nextclade sort --help`. + Sort(Box), + + /// Read genome annotation and present it in Nextclade's internal formats. This is mostly only useful for Nextclade maintainers and the most curious users. 
Note that these internal formats have no stability guarantees and can be changed at any time without notice. + /// + /// For short help type: `nextclade -h`, for extended help type: `nextclade --help`. Each subcommand has its own help, for example: `nextclade sort --help`. + ReadAnnotation(Box), } #[derive(Parser, Debug)] @@ -105,49 +120,80 @@ pub enum NextcladeDatasetCommands { Get(NextcladeDatasetGetArgs), } +#[allow(clippy::struct_excessive_bools)] #[derive(Parser, Debug)] #[clap(verbatim_doc_comment)] pub struct NextcladeDatasetListArgs { - /// Restrict list to datasets with this name. Equivalent to `--attribute='name='`. + /// Restrict list to datasets with this exact name. + /// + /// Can be used to test if a dataset exists. #[clap(long, short = 'n')] #[clap(value_hint = ValueHint::Other)] pub name: Option, - /// Restrict list to datasets based on this reference sequence (given its accession ID). Equivalent to `--attribute='reference='`. - /// - /// Special values: "all" - shows datasets with any reference sequences; "default" - show only datasets with default reference sequence (as defined by the author of a given dataset). + /// REMOVED #[clap(long, short = 'r')] #[clap(value_hint = ValueHint::Other)] - #[clap(default_value = "all")] - pub reference: String, + #[clap(hide_long_help = true, hide_short_help = true)] + pub reference: Option, - /// Restrict list to datasets with this version tag. Equivalent to `--attribute='tag='`. + /// Restrict list to datasets with this exact version tag. #[clap(long, short = 't')] #[clap(value_hint = ValueHint::Other)] - #[clap(default_value = "latest")] - pub tag: String, + pub tag: Option, - /// Restrict list to only datasets with a given combination of attribute key-value pairs. - /// Keys and values are separated with an equality sign. - /// This flag can occur multiple times, for multiple attributes. - /// Example: `--attribute='reference=MN908947' --attribute='tag=2022-04-28T12:00:00Z'`. 
+ /// REMOVED #[clap(long, short = 'a')] #[clap(value_hint = ValueHint::Other)] + #[clap(hide_long_help = true, hide_short_help = true)] pub attribute: Vec, - /// Include dataset version tags that are incompatible with this version of Nextclade CLI. By default the incompatible versions are omitted. + /// Include dataset versions that are incompatible with this version of Nextclade CLI. + /// + /// By default the incompatible versions are omitted. #[clap(long)] pub include_incompatible: bool, - /// Include older dataset version tags, additional to the latest. + /// REMOVED #[clap(long)] - pub include_old: bool, + #[clap(hide_long_help = true, hide_short_help = true)] + pub include_old: Option, + + /// Include deprecated datasets. + /// + /// By default the deprecated datasets are omitted. + /// + /// Authors can mark a dataset as deprecated to express that the dataset will no longer be updated and/or supported. Reach out to dataset authors for concrete details. + #[clap(long)] + pub include_deprecated: bool, + + /// Include experimental datasets. + /// + /// By default the experimental datasets are omitted. + /// + /// Authors can mark a dataset as experimental when development of the dataset is still in progress, or if the dataset is incomplete or of lower quality than usual. Use at own risk. Reach out to dataset authors if interested in further development and stabilizing of a particular dataset, and consider contributing. + #[clap(long)] + pub include_experimental: bool, + + /// Include community datasets. + /// + /// By default the community datasets are omitted. + /// + /// Community datasets are the datasets provided by the members of the broader Nextclade community. These datasets may vary in quality and completeness. Depending on authors' goals, these datasets may be created for specific purposes, rather than for general use. Nextclade team is unable to verify correctness of these datasets and does not provide support for them. 
For all questions regarding a concrete community dataset, please read its documentation and reach out to its authors. + #[clap(long)] + pub include_community: bool, /// Print output in JSON format. #[clap(long)] pub json: bool, - /// Use custom dataset server + /// Print only names of the datasets, without other details. + #[clap(long)] + pub only_names: bool, + + /// Use custom dataset server. + /// + /// You can host your own dataset server, with one or more datasets, grouped into dataset collections, and use this server to provide datasets to users of Nextclade CLI and Nextclade Web. Refer to Nextclade dataset documentation for more details. #[clap(long)] #[clap(value_hint = ValueHint::Url)] #[clap(default_value_t = Url::from_str(DATA_FULL_DOMAIN).expect("Invalid URL"))] @@ -161,37 +207,33 @@ pub struct NextcladeDatasetListArgs { #[clap(verbatim_doc_comment)] #[clap(group(ArgGroup::new("outputs").required(true).multiple(false)))] pub struct NextcladeDatasetGetArgs { - /// Name of the dataset to download. Equivalent to `--attribute='name='`. Use `dataset list` command to view available datasets. + /// Name of the dataset to download. Type `nextclade dataset list` to view available datasets. #[clap(long, short = 'n')] #[clap(value_hint = ValueHint::Other)] pub name: String, - /// Download dataset based on this reference sequence (given its accession ID). - /// If this flag is not provided or is 'default', will download dataset based on current default reference sequence, as defined by dataset maintainers. - /// The default reference sequence can change over time. Use `dataset list` command to view available options. - /// Equivalent to `--attribute='reference='`. + /// REMOVED #[clap(long, short = 'r')] #[clap(value_hint = ValueHint::Other)] - #[clap(default_value = "default")] - pub reference: String, + #[clap(hide_long_help = true, hide_short_help = true)] + pub reference: Option, /// Version tag of the dataset to download. 
- /// If this flag is not provided or is 'latest', then the latest **compatible** version is downloaded. - /// Equivalent to `--attribute='tag='`. + /// + /// If this flag is not provided the latest version is downloaded. #[clap(long, short = 't')] #[clap(value_hint = ValueHint::Other)] - #[clap(default_value = "latest")] - pub tag: String, + pub tag: Option, - /// Download dataset with a given combination of attribute key-value pairs. - /// Keys and values are separated with an equality sign. - /// This flag can occur multiple times, for multiple attributes. - /// Example: `--attribute='reference=MN908947' --attribute='tag=2022-04-28T12:00:00Z'`. + /// REMOVED #[clap(long, short = 'a')] #[clap(value_hint = ValueHint::Other)] + #[clap(hide_long_help = true, hide_short_help = true)] pub attribute: Vec, - /// Use custom dataset server + /// Use custom dataset server. + /// + /// You can host your own dataset server, with one or more datasets, grouped into dataset collections, and use this server to provide datasets to users of Nextclade CLI and Nextclade Web. Refer to Nextclade dataset documentation for more details. #[clap(long)] #[clap(value_hint = ValueHint::Url)] #[clap(default_value_t = Url::from_str(DATA_FULL_DOMAIN).expect("Invalid URL"))] @@ -234,19 +276,17 @@ pub enum NextcladeOutputSelection { Tree, TreeNwk, Translations, - Insertions, - Errors, } #[derive(Parser, Debug, Clone)] pub struct NextcladeRunInputArgs { /// Path to one or multiple FASTA files with input sequences /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". If no files provided, the plain fasta input is read from standard input (stdin). + /// Supports the following compression formats: "gz", "bz2", "xz", "zst". If no files provided, the plain fasta input is read from standard input (stdin). 
/// /// See: https://en.wikipedia.org/wiki/FASTA_format #[clap(value_hint = ValueHint::FilePath)] - #[clap(display_order = 1)] + #[clap(display_order = 0)] pub input_fastas: Vec, /// REMOVED. Use positional arguments instead. @@ -261,12 +301,12 @@ pub struct NextcladeRunInputArgs { /// /// See `nextclade dataset --help` on how to obtain datasets. /// - /// If this flag is not provided, the following individual input flags are required: `--input-root-seq`, - /// `--input-tree`, `--input-qc-config`, and the following individual input files are recommended: `--input-gene-map`, - /// `--input-pcr-primers`. + /// If this flag is not provided, no dataset will be loaded and individual input files have to be provided instead. In this case `--input-ref` is required and `--input-gene-map`, `--input-tree` and `--input-pathogen-json` are optional. /// /// If both the `--input-dataset` and individual `--input-*` flags are provided, each individual flag overrides the /// corresponding file in the dataset. + /// + /// Please refer to Nextclade documentation for more details about Nextclade datasets and their files. #[clap(long, short = 'D')] #[clap(value_hint = ValueHint::AnyPath)] pub input_dataset: Option, @@ -287,7 +327,7 @@ pub struct NextcladeRunInputArgs { /// /// Overrides path to `reference.fasta` in the dataset (`--input-dataset`). /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". Use "-" to read uncompressed data from standard input (stdin). + /// Supports the following compression formats: "gz", "bz2", "xz", "zst". Use "-" to read uncompressed data from standard input (stdin). #[clap(long, short = 'r', visible_alias("reference"), visible_alias("input-root-seq"))] #[clap(value_hint = ValueHint::FilePath)] pub input_ref: Option, @@ -298,61 +338,55 @@ pub struct NextcladeRunInputArgs { /// /// Overrides path to `tree.json` in the dataset (`--input-dataset`). /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". 
Use "-" to read uncompressed data from standard input (stdin). + /// Supports the following compression formats: "gz", "bz2", "xz", "zst". Use "-" to read uncompressed data from standard input (stdin). #[clap(long, short = 'a')] #[clap(value_hint = ValueHint::FilePath)] pub input_tree: Option, - /// Path to a JSON file containing configuration of Quality Control rules. - /// - /// Overrides path to `qc.json` in the dataset (`--input-dataset`). - /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". Use "-" to read uncompressed data from standard input (stdin). + /// REMOVED. The qc.json file has been merged into pathogen.json, see `--input-pathogen-json` #[clap(long, short = 'Q')] #[clap(value_hint = ValueHint::FilePath)] + #[clap(hide_long_help = true, hide_short_help = true)] pub input_qc_config: Option, /// Path to a JSON file containing configuration and data specific to a pathogen. /// /// Overrides path to `virus_properties.json` in the dataset (`--input-dataset`). /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". Use "-" to read uncompressed data from standard input (stdin). + /// Supports the following compression formats: "gz", "bz2", "xz", "zst". Use "-" to read uncompressed data from standard input (stdin). #[clap(long, short = 'R')] #[clap(value_hint = ValueHint::FilePath)] - pub input_virus_properties: Option, + pub input_pathogen_json: Option, - /// Path to a CSV file containing a list of custom PCR primer sites. This information is used to report mutations in these sites. - /// - /// Overrides path to `primers.csv` in the dataset (`--input-dataset`). - /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". Use "-" to read uncompressed data from standard input (stdin). + /// REMOVED.
Merged into pathogen.json, see `--input-pathogen-json` #[clap(long, short = 'p')] #[clap(value_hint = ValueHint::FilePath)] + #[clap(hide_long_help = true, hide_short_help = true)] pub input_pcr_primers: Option, - /// Path to a .gff file containing the gene map (genome annotation). + /// Path to a GFF3 file containing genome annotation. /// - /// Gene map (sometimes also called 'genome annotation') is used to find coding regions. If not supplied, coding regions will + /// Genome annotation is used to find coding regions. If not supplied, coding regions will /// not be translated, amino acid sequences will not be output, amino acid mutations will not be detected and nucleotide sequence /// alignment will not be informed by codon boundaries /// - /// List of genes can be restricted using `--genes` flag. Otherwise all genes found in the gene map will be used. + /// List of genes can be restricted using `--genes` flag. Otherwise all genes found in the genome annotation will be used. /// - /// Overrides path to `genemap.gff` provided by `--input-dataset`. + /// Overrides genome annotation provided by the dataset (`--input-dataset` or `--dataset-name`). /// /// Learn more about Generic Feature Format Version 3 (GFF3): /// https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md /// - /// Supports the following compression formats: "gz", "bz2", "xz", "zstd". Use "-" to read uncompressed data from standard input (stdin). + /// Supports the following compression formats: "gz", "bz2", "xz", "zst". Use "-" to read uncompressed data from standard input (stdin). #[clap(long, short = 'm', alias = "genemap")] #[clap(value_hint = ValueHint::FilePath)] - pub input_gene_map: Option, + pub input_annotation: Option, /// Comma-separated list of names of genes to use. /// /// This defines which peptides will be written into outputs, and which genes will be taken into account during - /// codon-aware alignment and aminoacid mutations detection.
Must only contain gene names present in the gene map. If - /// this flag is not supplied or its value is an empty string, then all genes found in the gene map will be used. + /// codon-aware alignment and aminoacid mutations detection. Must only contain gene names present in the genome annotation. If + /// this flag is not supplied or its value is an empty string, then all genes found in the genome annotation will be used. /// /// Requires `--input-gene-map` to be specified. #[clap( @@ -387,7 +421,7 @@ pub struct NextcladeRunOutputArgs { /// /// If both the `--output-all` and individual `--output-*` flags are provided, each individual flag overrides the corresponding default output path. /// - /// At least one of the output flags is required: `--output-all`, `--output-fasta`, `--output-ndjson`, `--output-json`, `--output-csv`, `--output-tsv`, `--output-tree`, `--output-translations`, `--output-insertions`, `--output-errors` + /// At least one of the output flags is required: `--output-all`, `--output-fasta`, `--output-ndjson`, `--output-json`, `--output-csv`, `--output-tsv`, `--output-tree`, `--output-translations`. /// /// If the required directory tree does not exist, it will be created. #[clap(long, short = 'O')] @@ -424,7 +458,7 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). /// /// If the required directory tree does not exist, it will be created. 
#[clap(long, short = 'o')] @@ -438,7 +472,7 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). /// /// If the required directory tree does not exist, it will be created. /// @@ -455,7 +489,7 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). /// /// If the required directory tree does not exist, it will be created. #[clap(long, short = 'N')] @@ -468,7 +502,7 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). 
/// /// If the required directory tree does not exist, it will be created. #[clap(long, short = 'J')] @@ -483,7 +517,7 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). /// /// If the required directory tree does not exist, it will be created. #[clap(long, short = 'c')] @@ -498,7 +532,7 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). /// /// If the required directory tree does not exist, it will be created. #[clap(long, short = 't')] @@ -529,7 +563,7 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). 
+ /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). /// /// If the required directory tree does not exist, it will be created. #[clap(long)] @@ -545,7 +579,7 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). /// /// If the required directory tree does not exist, it will be created. #[clap(long, short = 'T')] @@ -558,67 +592,28 @@ pub struct NextcladeRunOutputArgs { /// /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). /// /// If the required directory tree does not exist, it will be created. #[clap(long)] #[clap(value_hint = ValueHint::AnyPath)] pub output_tree_nwk: Option, - /// Path to output CSV file that contain insertions stripped from the reference alignment. - /// - /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. 
- /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). - /// - /// If the required directory tree does not exist, it will be created. + /// REMOVED. The argument `--output-insertions` have been removed in favor of `--output-csv` and `--output-tsv`. #[clap(long, short = 'I')] #[clap(value_hint = ValueHint::AnyPath)] + #[clap(hide_long_help = true, hide_short_help = true)] pub output_insertions: Option, - /// Path to output CSV file containing errors and warnings occurred during processing - /// - /// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`. - /// - /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zstd", then the file will be written compressed. Use "-" to write the uncompressed to standard output (stdout). - /// - /// If the required directory tree does not exist, it will be created. + /// REMOVED. The argument `--output-errors` have been removed in favor of `--output-csv` and `--output-tsv`. #[clap(long, short = 'e')] #[clap(value_hint = ValueHint::AnyPath)] + #[clap(hide_long_help = true, hide_short_help = true)] pub output_errors: Option, - - /// Whether to include aligned reference nucleotide sequence into output nucleotide sequence FASTA file and reference peptides into output peptide FASTA files. - #[clap(long)] - pub include_reference: bool, - - /// Whether to include the list of nearest nodes to the outputs - #[clap(long)] - pub include_nearest_node_info: bool, - - /// Emit output sequences in-order. - /// - /// With this flag the program will wait for results from the previous sequences to be written to the output files before writing the results of the next sequences, preserving the same order as in the input file. 
Due to variable sequence processing times, this might introduce unnecessary waiting times, but ensures that the resulting sequences are written in the same order as they occur in the inputs (except for sequences which have errors). - /// By default, without this flag, processing might happen out of order, which is faster, due to the elimination of waiting, but might also lead to results written out of order - the order of results is not specified and depends on thread scheduling and processing times of individual sequences. - /// - /// This option is only relevant when `--jobs` is greater than 1 or is omitted. - /// - /// Note: the sequences which trigger errors during processing will be omitted from outputs, regardless of this flag. - #[clap(long)] - pub in_order: bool, - - /// Replace unknown nucleotide characters with 'N' - /// - /// By default, the sequences containing unknown nucleotide characters are skipped with a warning - they - /// are not analyzed and not included into results. If this flag is provided, then before the alignment, - /// all unknown characters are replaced with 'N'. This replacement allows to analyze these sequences. - /// - /// The following characters are considered known: '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y' - #[clap(long)] - pub replace_unknown: bool, } #[derive(Parser, Debug, Clone)] -pub struct NextcladeRunOtherArgs { +pub struct NextcladeRunOtherParams { /// Number of processing jobs. If not specified, all available CPU threads will be used. 
#[clap(global = false, long, short = 'j', default_value_t = num_cpus::get())] pub jobs: usize, @@ -626,20 +621,120 @@ pub struct NextcladeRunOtherArgs { #[derive(Parser, Debug, Clone)] pub struct NextcladeRunArgs { - #[clap(flatten, next_help_heading = " Inputs")] + #[clap(flatten, next_help_heading = "Inputs")] pub inputs: NextcladeRunInputArgs, - #[clap(flatten, next_help_heading = " Outputs")] + #[clap(flatten, next_help_heading = "Outputs")] pub outputs: NextcladeRunOutputArgs, - #[clap(flatten, next_help_heading = " Phylogenetic tree parameters")] - pub tree_builder_params: TreeBuilderParamsOptional, + #[clap(flatten)] + pub params: NextcladeInputParamsOptional, + + #[clap(flatten, next_help_heading = "Other")] + pub other_params: NextcladeRunOtherParams, +} + +#[allow(clippy::struct_excessive_bools)] +#[derive(Parser, Debug)] +#[clap(verbatim_doc_comment)] +pub struct NextcladeSortArgs { + /// Path to one or multiple FASTA files with input sequences + /// + /// Supports the following compression formats: "gz", "bz2", "xz", "zst". If no files provided, the plain fasta input is read from standard input (stdin). + /// + /// See: https://en.wikipedia.org/wiki/FASTA_format + #[clap(value_hint = ValueHint::FilePath)] + pub input_fastas: Vec, + + /// Path to input minimizer index JSON file. + /// + /// By default the latest reference minimizer index is fetched from the dataset server (default or customized with `--server` argument). If this argument is provided, the algorithm skips fetching the default index and uses the index provided in the JSON file. + /// + /// Supports the following compression formats: "gz", "bz2", "xz", "zst". Use "-" to read uncompressed data from standard input (stdin). + #[clap(long, short = 'm')] + #[clap(value_hint = ValueHint::FilePath)] + pub input_minimizer_index_json: Option, + + /// Path to output directory + /// + /// Sequences will be written in subdirectories: one subdirectory per dataset.
Sequences inferred to belong to a particular dataset will be placed in the corresponding subdirectory. The subdirectory tree can be nested, depending on how dataset names are organized. + /// + /// Mutually exclusive with `--output-path`. + /// + #[clap(short = 'O', long)] + #[clap(value_hint = ValueHint::DirPath)] + #[clap(group = "outputs")] + pub output_dir: Option, + + /// Template string for the file path to output sorted sequences. A separate file will be generated per dataset. + /// + /// The string should contain template variable `{name}`, where the dataset name will be substituted. Note that if the `{name}` variable contains slashes, they will be interpreted as path segments and subdirectories will be created. + /// + /// Make sure you properly quote and/or escape the curly braces, so that your shell, programming language or pipeline manager does not attempt to substitute the variables. + /// + /// Mutually exclusive with `--output-dir`. + /// + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. If the required directory tree does not exist, it will be created. + /// + /// Example for bash shell: + /// + /// --output-path='outputs/{name}/sorted.fasta.gz' + #[clap(short = 'o', long)] + #[clap(group = "outputs")] + pub output_path: Option, + + /// Path to output results TSV file + /// + /// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed. Use "-" to write uncompressed to standard output (stdout). If the required directory tree does not exist, it will be created.
+ #[clap(short = 'r', long)] + #[clap(value_hint = ValueHint::FilePath)] + pub output_results_tsv: Option, - #[clap(flatten, next_help_heading = " Alignment parameters")] - pub alignment_params: AlignPairwiseParamsOptional, + #[clap(flatten, next_help_heading = "Algorithm")] + pub search_params: NextcladeSeqSortParams, - #[clap(flatten, next_help_heading = " Other")] - pub other: NextcladeRunOtherArgs, + #[clap(flatten, next_help_heading = "Other")] + pub other_params: NextcladeRunOtherParams, + + /// Use custom dataset server. + /// + /// You can host your own dataset server, with one or more datasets, grouped into dataset collections, and use this server to provide datasets to users of Nextclade CLI and Nextclade Web. Refer to Nextclade dataset documentation for more details. + #[clap(long)] + #[clap(value_hint = ValueHint::Url)] + #[clap(default_value_t = Url::from_str(DATA_FULL_DOMAIN).expect("Invalid URL"))] + pub server: Url, + + #[clap(flatten)] + pub proxy_config: ProxyConfig, +} + +#[allow(clippy::struct_excessive_bools)] +#[derive(Parser, Debug)] +#[clap(verbatim_doc_comment)] +pub struct NextcladeReadAnnotationArgs { + /// Genome annotation file in GFF3 format. + /// + /// Learn more about Generic Feature Format Version 3 (GFF3): + /// https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md + /// + #[clap(value_hint = ValueHint::FilePath)] + #[clap(display_order = 0)] + pub input_annotation: Option, + + /// Path to output JSON or YAML file. + /// + /// The format is chosen based on file extension: ".json" or ".yaml". + #[clap(long, short = 'o')] + #[clap(value_hint = ValueHint::DirPath)] + pub output: Option, + + /// Present features in "feature tree" format. This format is a precursor of genome annotation format - it contains all genetic features, even the ones that Nextclade does not use, but also less information about each feature. 
+ #[clap(long)] + pub feature_tree: bool, + + /// Print console output in JSON format, rather than human-readable table. + #[clap(long)] + pub json: bool, } fn generate_completions(shell: &str) -> Result<(), Report> { @@ -676,8 +771,6 @@ pub fn nextclade_get_output_filenames(run_args: &mut NextcladeRunArgs) -> Result output_tsv, output_tree, output_tree_nwk, - output_insertions, - output_errors, .. }, .. @@ -703,14 +796,6 @@ pub fn nextclade_get_output_filenames(run_args: &mut NextcladeRunArgs) -> Result output_fasta.get_or_insert(add_extension(&default_output_file_path, "aligned.fasta")); } - if output_selection.contains(&NextcladeOutputSelection::Insertions) { - output_insertions.get_or_insert(add_extension(&default_output_file_path, "insertions.csv")); - } - - if output_selection.contains(&NextcladeOutputSelection::Errors) { - output_errors.get_or_insert(add_extension(&default_output_file_path, "errors.csv")); - } - if output_selection.contains(&NextcladeOutputSelection::Translations) { let output_translations_path = default_output_file_path.with_file_name(format!("{output_basename}_gene_{{gene}}")); @@ -775,8 +860,6 @@ Example for bash shell: output_csv, output_tsv, output_tree, - output_insertions, - output_errors, ] .iter() .all(|o| o.is_none()) @@ -794,9 +877,7 @@ At least one of the following flags is required: --output-csv --output-tsv --output-tree - --output-translations - --output-insertions - --output-errors"# + --output-translations"# ); } @@ -831,15 +912,79 @@ For more information, type nextclade run --help"#; +const ERROR_MSG_INPUT_QC_CONFIG_REMOVED: &str = r#"The argument `--input-qc-config` is removed in favor of `--input-pathogen-json`. + +Since Nextclade v3, the `pathogen.json` file is an extended version of file known as `virus_properties.json` in Nextclade v2. The Nextclade v2 files `qc.json`, `primers.csv` and `tag.json` are now merged into `pathogen.json`. 
+ +For more information, type + + nextclade run --help + +Read Nextclade documentation at: + + https://docs.nextstrain.org/projects/nextclade/en/stable"#; + +const ERROR_MSG_INPUT_PCR_PRIMERS_REMOVED: &str = r#"The argument `--input-pcr-primers` is removed in favor of `--input-pathogen-json`. + +Since Nextclade v3, the `pathogen.json` file is an extended version of file known as `virus_properties.json` in Nextclade v2. The Nextclade v2 files `qc.json`, `primers.csv` and `tag.json` are now merged into `pathogen.json`. + +For more information, type + + nextclade run --help + +Read Nextclade documentation at: + + https://docs.nextstrain.org/projects/nextclade/en/stable"#; + +const ERROR_MSG_OUTPUT_INSERTIONS_REMOVED: &str = r#"The argument `--output-insertions` have been removed in favor of `--output-csv` and `--output-tsv`. + +In Nextclade v3 the separate arguments `--output-insertions` and `--output-errors` are removed. Please use `--output-csv` (for semicolon-separated table) and `--output-tsv` (for tab-separated table) arguments instead. These tables contain, among others, all the columns from the output insertions table (`--output-insertions`) as well as from the output errors table (`--output-errors`). + +For more information, type + + nextclade run --help + +Read Nextclade documentation at: + + https://docs.nextstrain.org/projects/nextclade/en/stable"#; + +const ERROR_MSG_OUTPUT_ERRORS_REMOVED: &str = r#"The argument `--output-errors` have been removed in favor of `--output-csv` and `--output-tsv`. + +In Nextclade v3 the separate arguments `--output-insertions` and `--output-errors` are removed. Please use `--output-csv` (for semicolon-separated table) and `--output-tsv` (for tab-separated table) arguments instead. These tables contain, among others, all the columns from the output insertions table (`--output-insertions`) as well as from the output errors table (`--output-errors`). 
+ +For more information, type + + nextclade run --help + +Read Nextclade documentation at: + + https://docs.nextstrain.org/projects/nextclade/en/stable"#; + pub fn nextclade_check_removed_args(run_args: &NextcladeRunArgs) -> Result<(), Report> { if run_args.inputs.input_fasta.is_some() { return make_error!("{ERROR_MSG_INPUT_FASTA_REMOVED}"); } + if run_args.inputs.input_qc_config.is_some() { + return make_error!("{ERROR_MSG_INPUT_QC_CONFIG_REMOVED}"); + } + + if run_args.inputs.input_pcr_primers.is_some() { + return make_error!("{ERROR_MSG_INPUT_PCR_PRIMERS_REMOVED}"); + } + if run_args.outputs.output_dir.is_some() { return make_error!("{ERROR_MSG_OUTPUT_DIR_REMOVED}"); } + if run_args.outputs.output_insertions.is_some() { + return make_error!("{ERROR_MSG_OUTPUT_INSERTIONS_REMOVED}"); + } + + if run_args.outputs.output_errors.is_some() { + return make_error!("{ERROR_MSG_OUTPUT_ERRORS_REMOVED}"); + } + Ok(()) } @@ -878,5 +1023,7 @@ pub fn nextclade_parse_cli_args() -> Result<(), Report> { NextcladeDatasetCommands::List(dataset_list_args) => nextclade_dataset_list(dataset_list_args), NextcladeDatasetCommands::Get(dataset_get_args) => nextclade_dataset_get(&dataset_get_args), }, + NextcladeCommands::Sort(seq_sort_args) => nextclade_seq_sort(&seq_sort_args), + NextcladeCommands::ReadAnnotation(read_annotation_args) => nextclade_read_annotation(&read_annotation_args), } } diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_dataset_get.rs b/packages_rs/nextclade-cli/src/cli/nextclade_dataset_get.rs index cf038de5a..22b9da1dd 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_dataset_get.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_dataset_get.rs @@ -1,151 +1,120 @@ use crate::cli::nextclade_cli::NextcladeDatasetGetArgs; -use crate::dataset::dataset_attributes::{format_attribute_list, parse_dataset_attributes}; use crate::dataset::dataset_download::{dataset_dir_download, dataset_zip_download, download_datasets_index_json}; -use 
crate::dataset::dataset_table::format_dataset_table; use crate::io::http_client::HttpClient; -use eyre::{eyre, Report, WrapErr}; +use eyre::{Report, WrapErr}; use itertools::Itertools; -use log::{info, LevelFilter}; +use log::{warn, LevelFilter}; use nextclade::io::dataset::{Dataset, DatasetsIndexJson}; -use nextclade::{getenv, make_error}; - -const THIS_VERSION: &str = getenv!("CARGO_PKG_VERSION"); +use nextclade::utils::info::{this_package_version, this_package_version_str}; +use nextclade::utils::string::find_similar_strings; +use nextclade::{make_error, make_internal_error}; pub struct DatasetHttpGetParams<'s> { pub name: &'s str, - pub reference: &'s str, pub tag: &'s str, } -pub fn nextclade_dataset_http_get( - http: &mut HttpClient, - DatasetHttpGetParams { name, reference, tag }: DatasetHttpGetParams, - attributes: &[String], -) -> Result { - let DatasetsIndexJson { datasets, .. } = download_datasets_index_json(http)?; +pub fn nextclade_dataset_get( + NextcladeDatasetGetArgs { + name, + reference, + tag, + attribute, + server, + output_dir, + output_zip, + proxy_config, + }: &NextcladeDatasetGetArgs, +) -> Result<(), Report> { + if reference.is_some() || !attribute.is_empty() { + return make_error!("The arguments `--reference` and `--attribute` are removed. Datasets are now queried by `--name` and `--tag` only.\n\nIn order to list all dataset names, type:\n\n nextclade dataset list --names-only\n\n. 
Please refer to `--help` and to Nextclade documentation for more details."); + } - // Parse attribute key-value pairs - let mut attributes = parse_dataset_attributes(attributes)?; + let verbose = log::max_level() > LevelFilter::Info; - // Handle special attributes differently - let name = if let Some(attr_name) = attributes.remove("name") { - attr_name - } else { - name.to_owned() - }; + let mut http = HttpClient::new(server, proxy_config, verbose)?; + let dataset = dataset_http_get(&mut http, name, tag)?; - if let Some(attr_reference) = attributes.remove("reference") { - attr_reference + if let Some(output_dir) = &output_dir { + dataset_dir_download(&mut http, &dataset, output_dir)?; + } else if let Some(output_zip) = &output_zip { + dataset_zip_download(&mut http, &dataset, output_zip)?; } else { - reference.to_owned() - }; + } - if let Some(attr_tag) = attributes.remove("tag") { - attr_tag - } else { - tag.to_owned() - }; + Ok(()) +} + +pub fn dataset_http_get(http: &mut HttpClient, name: impl AsRef, tag: &Option) -> Result { + let name = name.as_ref(); + let tag = tag.as_ref(); - let mut filtered = datasets + let DatasetsIndexJson { collections, .. 
} = download_datasets_index_json(http)?; + + let datasets = collections .into_iter() - .filter(|dataset| dataset.enabled) + .flat_map(|collection| collection.datasets) + .collect_vec(); + + let paths = datasets.iter().map(|dataset| dataset.path.clone()).collect_vec(); + + let mut filtered = datasets.into_iter().filter(Dataset::is_enabled) .filter(|dataset| -> bool { // If a concrete version `tag` is specified, we skip 'enabled', 'compatibility' and 'latest' checks - if tag == "latest" { - let is_not_old = dataset.is_latest(); - let is_compatible = dataset.is_compatible(THIS_VERSION); - is_compatible && is_not_old + if let Some(tag) = tag.as_ref() { + dataset.is_tag(tag) } else { - dataset.attributes.tag.value == tag - } - }) - // Filter by reference sequence - .filter(|dataset| { - if reference == "default" { - dataset.attributes.reference.is_default - } else { - dataset.attributes.reference.value == reference + dataset.is_latest() } }) // Filter by name .filter(|dataset| { - dataset.attributes.name.value == name - }) - // Filter by remaining attributes - .filter(|dataset| { - let mut should_include = true; - for (key, val) in &attributes { - let is_attr_matches = match dataset.attributes.rest_attrs.get(key) { - Some(attr) => { - if val == "default" { - attr.is_default - } else { - &attr.value == val - } - } - None => false - }; - should_include = should_include && is_attr_matches; - } - should_include + dataset.path == name }) .collect_vec(); - let attributes_fmt = { - let attributes_fmt = format_attribute_list(&Some(name), reference, tag, &attributes); - if attributes_fmt.is_empty() { - "".to_owned() - } else { - format!(" having attributes: {attributes_fmt}") + let dataset = match &filtered.len() { + 0 => { + let suggestions = find_similar_strings(paths.iter(), &name).take(10).collect_vec(); + let suggestions_msg = (!suggestions.is_empty()) + .then(|| { + let suggestions = suggestions.iter().map(|s| format!("- {s}")).join("\n"); + format!("\n\nDid you 
mean:\n{suggestions}\n?") + }) + .unwrap_or_default(); + make_error!( + "Dataset not found: '{name}'.{suggestions_msg}\n\nType `nextclade dataset list` to show available datasets." + ) } - }; - - info!("Searching for datasets{attributes_fmt}"); - - match &filtered.len() { - 0 => make_error!("No datasets found{attributes_fmt}. Use `datasets list` command to show available datasets."), 1 => Ok(filtered.remove(0)), _ => { - let table = format_dataset_table(&filtered); - make_error!("Can download only a single dataset, but multiple datasets found{attributes_fmt}. Add more specific attributes to select one of them. Given current attributes, the candidates are:\n{table}") + make_internal_error!("Expected to find a single dataset, but multiple datasets found.") } + }?; + + if !dataset.is_cli_compatible(this_package_version()) { + warn!( + "The requested dataset '{}' with version tag '{}' is not compatible with this version of Nextclade ({}). This may cause errors and unexpected results. Please try to upgrade your Nextclade version and/or report this to dataset authors.", + dataset.path, + dataset.tag(), + this_package_version_str() + ); } -} -pub fn nextclade_dataset_get(args: &NextcladeDatasetGetArgs) -> Result<(), Report> { - let verbose = log::max_level() > LevelFilter::Info; - let mut http = HttpClient::new(&args.server, &args.proxy_config, verbose)?; - - let dataset = nextclade_dataset_http_get( - &mut http, - DatasetHttpGetParams { - name: &args.name, - reference: &args.reference, - tag: &args.tag, - }, - &args.attribute, - )?; - - if let Some(output_dir) = &args.output_dir { - dataset_dir_download(&mut http, &dataset, output_dir)?; - } - - if let Some(output_zip) = &args.output_zip { - dataset_zip_download(&mut http, &dataset, output_zip)?; - } - - Ok(()) + Ok(dataset) } -pub fn dataset_file_http_get(http: &mut HttpClient, dataset: &Dataset, filename: &str) -> Result { - let url = dataset - .files - .get(filename) - .ok_or_else(|| eyre!("File not found in the 
dataset: '{}'", filename))?; +pub fn dataset_file_http_get( + http: &mut HttpClient, + dataset: &Dataset, + filename: impl AsRef, +) -> Result { + let filename = filename.as_ref(); + let url = dataset.file_path(filename); let content = http .get(&url) - .wrap_err_with(|| format!("Dataset file download failed: '{url}'"))?; + .wrap_err_with(|| format!("when fetching dataset file '{filename}'"))?; let content_string = String::from_utf8(content)?; diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_dataset_list.rs b/packages_rs/nextclade-cli/src/cli/nextclade_dataset_list.rs index 2b6976cd8..420cd6f77 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_dataset_list.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_dataset_list.rs @@ -1,130 +1,88 @@ use crate::cli::nextclade_cli::NextcladeDatasetListArgs; -use crate::dataset::dataset_attributes::{format_attribute_list, parse_dataset_attributes}; use crate::dataset::dataset_download::download_datasets_index_json; use crate::dataset::dataset_table::format_dataset_table; use crate::io::http_client::HttpClient; use eyre::Report; use itertools::Itertools; use log::LevelFilter; -use nextclade::getenv; -use nextclade::io::dataset::DatasetsIndexJson; +use nextclade::io::dataset::{Dataset, DatasetsIndexJson}; use nextclade::io::json::{json_stringify, JsonPretty}; - -const THIS_VERSION: &str = getenv!("CARGO_PKG_VERSION"); +use nextclade::make_error; +use nextclade::utils::info::this_package_version; pub fn nextclade_dataset_list( NextcladeDatasetListArgs { - mut name, - mut reference, - mut tag, + name, + reference, + tag, attribute, include_incompatible, include_old, + include_deprecated, + include_experimental, + include_community, json, + only_names, server, proxy_config, }: NextcladeDatasetListArgs, ) -> Result<(), Report> { - let verbose = log::max_level() > LevelFilter::Info; - let mut http = HttpClient::new(&server, &proxy_config, verbose)?; - let DatasetsIndexJson { datasets, .. 
} = download_datasets_index_json(&mut http)?; - - // Parse attribute key-value pairs - let mut attributes = parse_dataset_attributes(&attribute)?; - - // Handle special attributes differently - if let Some(attr_name) = attributes.remove("name") { - name = Some(attr_name); - } - if let Some(attr_reference) = attributes.remove("reference") { - reference = attr_reference; + if include_old.is_some() { + return make_error!("The argument `--include-old` is removed.\n\nAll version tags are always listed now\n\n. Please refer to `--help` and to Nextclade documentation for more details."); } - if let Some(attr_tag) = attributes.remove("tag") { - tag = attr_tag; + + if reference.is_some() || !attribute.is_empty() { + return make_error!("The arguments `--reference` and `--attribute` are removed. Datasets are now queried by `--name` and `--tag` only.\n\nIn order to list all dataset names, type:\n\n nextclade dataset list --names-only\n\n. Please refer to `--help` and to Nextclade documentation for more details."); } - let filtered = datasets + let verbose = log::max_level() > LevelFilter::Info; + + let mut http = HttpClient::new(&server, &proxy_config, verbose)?; + let DatasetsIndexJson { collections, .. 
} = download_datasets_index_json(&mut http)?; + + let filtered = collections .into_iter() - .filter(|dataset| dataset.enabled) + .flat_map(|collection| collection.datasets) + .filter(Dataset::is_enabled) .filter(|dataset| -> bool { // If a concrete version `tag` is specified, we skip 'enabled', 'compatibility' and 'latest' checks - if tag == "latest" { - let is_not_old = include_old || dataset.is_latest(); - let is_compatible = include_incompatible || dataset.is_compatible(THIS_VERSION); - is_compatible && is_not_old - } else { - dataset.attributes.tag.value == tag - } - }) - // Filter by reference sequence - .filter(|dataset| { - if reference == "all" { - true - } else if reference == "default" { - dataset.attributes.reference.is_default + if let Some(tag) = tag.as_ref() { + dataset.is_tag(tag) } else { - dataset.attributes.reference.value == reference + let is_compatible = include_incompatible || dataset.is_cli_compatible(this_package_version()); + let is_not_deprecated = include_deprecated || !dataset.is_deprecated(); + let is_not_experimental = include_experimental || !dataset.is_experimental(); + let is_not_community = include_community || !dataset.is_community(); + is_compatible && is_not_deprecated && is_not_experimental && is_not_community } }) // Filter by name .filter(|dataset| { if let Some(name) = &name { &dataset.attributes.name.value == name } else {true} }) - // Filter by remaining attributes - .filter(|dataset| { - let mut should_include = true; - for (key, val) in &attributes { - let is_attr_matches = match dataset.attributes.rest_attrs.get(key) { - Some(attr) => { - if val == "default" { - attr.is_default - } else { - &attr.value == val - } - } - None => false - }; - should_include = should_include && is_attr_matches; - } - should_include - }) - .sorted_by_key(|dataset| ( - !dataset.attributes.name.is_default, - dataset.attributes.name.value.to_ascii_lowercase(), - !dataset.attributes.reference.is_default, - 
dataset.attributes.reference.value.to_ascii_lowercase(), - !dataset.attributes.tag.is_default, - dataset.attributes.tag.value.to_ascii_lowercase(), - )) .collect_vec(); + let names = filtered.iter().map(|dataset| &dataset.path).collect_vec(); + if json { - println!("{}", json_stringify(&filtered, JsonPretty(true))?); + let content = if only_names { + json_stringify(&names, JsonPretty(true)) + } else { + json_stringify(&filtered, JsonPretty(true)) + }?; + println!("{content}"); } else { if filtered.is_empty() { return Ok(()); } - let table = format_dataset_table(&filtered); - - let attributes_fmt = { - let attributes_fmt = format_attribute_list(&name, &reference, &tag, &attributes); - if attributes_fmt.is_empty() { - "".to_owned() - } else { - format!(", having attributes: {attributes_fmt}") - } + let content = if only_names { + names.into_iter().join("\n") + } else { + format_dataset_table(&filtered) }; - if !include_incompatible && !include_old { - println!("Showing latest dataset(s) compatible with this version of Nextclade ({THIS_VERSION}){attributes_fmt}:\n{table}"); - } else if !include_incompatible { - println!("Showing latest dataset(s){attributes_fmt}:\n{table}"); - } else if !include_old { - println!("Showing datasets compatible with this version of Nextclade ({THIS_VERSION}){attributes_fmt}:\n{table}"); - } else { - println!("Showing all datasets{attributes_fmt}:\n{table}"); - } + println!("{content}"); } Ok(()) diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs index e68188c9f..af94dac9b 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs @@ -1,200 +1,64 @@ use crate::cli::nextclade_cli::{ - NextcladeRunArgs, NextcladeRunInputArgs, NextcladeRunOtherArgs, NextcladeRunOutputArgs, + NextcladeRunArgs, NextcladeRunInputArgs, NextcladeRunOtherParams, NextcladeRunOutputArgs, }; use 
crate::cli::nextclade_ordered_writer::NextcladeOrderedWriter; -use crate::dataset::dataset_download::{ - dataset_dir_load, dataset_individual_files_load, dataset_str_download_and_load, dataset_zip_load, DatasetFilesContent, -}; +use crate::dataset::dataset_download::nextclade_get_inputs; use eyre::{Report, WrapErr}; -use itertools::Itertools; use log::info; -use nextclade::align::gap_open::{get_gap_open_close_scores_codon_aware, get_gap_open_close_scores_flat}; -use nextclade::align::params::AlignPairwiseParams; -use nextclade::align::seed_match2::CodonSpacedIndex; -use nextclade::alphabet::nuc::{to_nuc_seq, to_nuc_seq_replacing, Nuc}; -use nextclade::analyze::find_aa_motifs::find_aa_motifs; -use nextclade::analyze::phenotype::get_phenotype_attr_descs; use nextclade::gene::gene_map_display::gene_map_to_table_string; -use nextclade::graph::graph::{convert_auspice_tree_to_graph, convert_graph_to_auspice_tree}; +use nextclade::graph::graph::convert_graph_to_auspice_tree; use nextclade::io::fasta::{FastaReader, FastaRecord}; -use nextclade::io::fs::has_extension; use nextclade::io::json::{json_write, JsonPretty}; use nextclade::io::nextclade_csv::CsvColumnConfig; use nextclade::io::nwk_writer::nwk_write_to_file; -use nextclade::make_error; -use nextclade::run::nextclade_run_one::nextclade_run_one; -use nextclade::translate::translate_genes::Translation; -use nextclade::translate::translate_genes_ref::translate_genes_ref; -use nextclade::tree::params::TreeBuilderParams; +use nextclade::run::nextclade_wasm::{AnalysisInitialData, AnalysisOutput, Nextclade}; use nextclade::tree::tree_builder::graph_attach_new_nodes_in_place; -use nextclade::tree::tree_preprocess::graph_preprocess_in_place; use nextclade::types::outputs::NextcladeOutputs; -use std::path::PathBuf; pub struct NextcladeRecord { pub index: usize, pub seq_name: String, - pub outputs_or_err: Result<(Vec, Translation, NextcladeOutputs), Report>, -} - -pub struct DatasetFilePaths { - input_ref: PathBuf, - 
input_tree: PathBuf, - input_qc_config: PathBuf, - input_virus_properties: PathBuf, - input_pcr_primers: PathBuf, - input_gene_map: PathBuf, -} - -pub fn nextclade_get_inputs( - run_args: &NextcladeRunArgs, - genes: &Option>, -) -> Result { - if let Some(dataset_name) = run_args.inputs.dataset_name.as_ref() { - dataset_str_download_and_load(run_args, dataset_name, genes) - .wrap_err_with(|| format!("When downloading dataset '{dataset_name}'")) - } else if let Some(input_dataset) = run_args.inputs.input_dataset.as_ref() { - if input_dataset.is_file() && has_extension(input_dataset, "zip") { - dataset_zip_load(run_args, input_dataset, genes) - } else if input_dataset.is_dir() { - dataset_dir_load(run_args.clone(), input_dataset, genes) - } else { - make_error!( - "--input-dataset: path is invalid. \ - Expected a directory path or a zip archive file path, but got: '{input_dataset:#?}'" - ) - } - } else { - dataset_individual_files_load(run_args, genes) - } + pub outputs_or_err: Result, } pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> { info!("Command-line arguments:\n{run_args:#?}"); let NextcladeRunArgs { - inputs: - NextcladeRunInputArgs { - input_fastas, - input_dataset, - input_ref, - input_tree, - input_qc_config, - input_virus_properties, - input_pcr_primers, - input_gene_map, - genes, - .. - }, + inputs: NextcladeRunInputArgs { + input_fastas, genes, .. + }, outputs: NextcladeRunOutputArgs { - output_all, - output_basename, - output_selection, - output_fasta, - output_translations, - output_ndjson, - output_json, - output_csv, - output_tsv, output_columns_selection, + output_graph, output_tree, output_tree_nwk, - output_graph, - output_insertions, - output_errors, - include_reference, - include_nearest_node_info, - in_order, - replace_unknown, .. 
}, - other: NextcladeRunOtherArgs { jobs }, - tree_builder_params, - alignment_params, + params, + other_params: NextcladeRunOtherParams { jobs }, } = run_args.clone(); - let DatasetFilesContent { - ref_record, - virus_properties, - tree, - ref gene_map, - qc_config, - primers, - } = nextclade_get_inputs(&run_args, &genes)?; - - let ref_seq = &to_nuc_seq(&ref_record.seq).wrap_err("When reading reference sequence")?; - let seed_index = &CodonSpacedIndex::from_sequence(ref_seq); - - let alignment_params = { - let mut alignment_params = AlignPairwiseParams::default(); - - // Merge alignment params coming from virus_properties into alignment_params - if let Some(alignment_params_from_file) = &virus_properties.alignment_params { - alignment_params.merge_opt(alignment_params_from_file.clone()); - } - - // Merge alignment params coming from CLI arguments - alignment_params.merge_opt(run_args.alignment_params); - - alignment_params - }; - - let tree_builder_params = { - let mut tree_builder_params = TreeBuilderParams::default(); - - // Merge tree builder params coming from virus_properties into alignment_params - if let Some(tree_builder_params_from_file) = &virus_properties.tree_builder_params { - tree_builder_params.merge_opt(tree_builder_params_from_file.clone()); - } - - // Merge tree builder params coming from CLI arguments - tree_builder_params.merge_opt(run_args.tree_builder_params); + let inputs = nextclade_get_inputs(&run_args, &genes)?; + let nextclade = Nextclade::new(inputs, ¶ms)?; - tree_builder_params - }; - - info!("Alignment parameters (final):\n{alignment_params:#?}"); - info!("Tree builder parameters (final):\n{tree_builder_params:#?}"); - info!("Gene map:\n{}", gene_map_to_table_string(gene_map)?); - - let gap_open_close_nuc = &get_gap_open_close_scores_codon_aware(ref_seq, gene_map, &alignment_params); - let gap_open_close_aa = &get_gap_open_close_scores_flat(ref_seq, &alignment_params); - - let ref_translation = - &translate_genes_ref(ref_seq, 
gene_map, &alignment_params).wrap_err("When translating reference genes")?; - - let ref_cds_translations = ref_translation - .genes() - .flat_map(|gene| gene.cdses.values()) - .cloned() - .collect_vec(); - - let aa_motifs_ref = &find_aa_motifs(&virus_properties.aa_motifs, ref_translation)?; - - let should_keep_outputs = output_tree.is_some() || output_tree_nwk.is_some() || output_graph.is_some(); + let should_write_tree = output_tree.is_some() || output_tree_nwk.is_some() || output_graph.is_some(); let mut outputs = Vec::::new(); - let phenotype_attrs = &get_phenotype_attr_descs(&virus_properties); - - let mut graph = convert_auspice_tree_to_graph(tree)?; - graph_preprocess_in_place(&mut graph, ref_seq, ref_translation)?; - let clade_node_attrs = graph.data.meta.clade_node_attr_descs(); - - let aa_motifs_keys = &virus_properties - .aa_motifs - .iter() - .map(|desc| desc.name.clone()) - .collect_vec(); - let csv_column_config = CsvColumnConfig::new(&output_columns_selection)?; + info!("Parameters (final):\n{:#?}", &nextclade.params); + info!("Genome annotation:\n{}", gene_map_to_table_string(&nextclade.gene_map)?); + std::thread::scope(|s| { const CHANNEL_SIZE: usize = 128; let (fasta_sender, fasta_receiver) = crossbeam_channel::bounded::(CHANNEL_SIZE); let (result_sender, result_receiver) = crossbeam_channel::bounded::(CHANNEL_SIZE); + let nextclade = &nextclade; let outputs = &mut outputs; + let run_args = &run_args; s.spawn(|| { let mut reader = FastaReader::from_paths(&input_fastas).unwrap(); @@ -215,60 +79,30 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> { for _ in 0..jobs { let fasta_receiver = fasta_receiver.clone(); let result_sender = result_sender.clone(); - let gap_open_close_nuc = &gap_open_close_nuc; - let gap_open_close_aa = &gap_open_close_aa; - let alignment_params = &alignment_params; - let ref_translation = &ref_translation; - let primers = &primers; - let graph = &graph; - let qc_config = &qc_config; - let 
virus_properties = &virus_properties; s.spawn(move || { let result_sender = result_sender.clone(); - for FastaRecord { seq_name, seq, index } in &fasta_receiver { - info!("Processing sequence '{seq_name}'"); + for fasta_record in &fasta_receiver { + info!("Processing sequence '{}'", fasta_record.seq_name); - let outputs_or_err = if replace_unknown { - Ok(to_nuc_seq_replacing(&seq)) - } else { - to_nuc_seq(&seq) - } - .wrap_err_with(|| format!("When processing sequence #{index} '{seq_name}'")) - .and_then(|qry_seq| { - nextclade_run_one( - index, - &seq_name, - &qry_seq, - ref_seq, - seed_index, - ref_translation, - aa_motifs_ref, - gene_map, - primers, - graph, - qc_config, - virus_properties, - gap_open_close_nuc, - gap_open_close_aa, - alignment_params, - include_nearest_node_info, + let outputs_or_err = nextclade.run(&fasta_record).wrap_err_with(|| { + format!( + "When processing sequence #{} '{}'", + fasta_record.index, fasta_record.seq_name ) }); - let record = NextcladeRecord { - index, - seq_name, - outputs_or_err, - }; - // Important: **all** records should be sent into this channel, without skipping. // In in-order mode, writer that receives from this channel expects a contiguous stream of indices. Gaps in // the indices will cause writer to stall waiting for the missing index and the buffering queue to grow. Any // filtering of records should be done in the writer, instead of here. 
result_sender - .send(record) + .send(NextcladeRecord { + index: fasta_record.index, + seq_name: fasta_record.seq_name, + outputs_or_err, + }) .wrap_err("When sending NextcladeRecord") .unwrap(); } @@ -278,36 +112,41 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> { } let writer = s.spawn(move || { - let mut output_writer = NextcladeOrderedWriter::new( + let nextclade = &nextclade; + + let AnalysisInitialData { + genome_size, gene_map, - clade_node_attrs, - phenotype_attrs, - aa_motifs_keys, - &output_fasta, - &output_json, - &output_ndjson, - &output_csv, - &output_tsv, - &output_insertions, - &output_errors, - &output_translations, + clade_node_attr_key_descs, + phenotype_attr_descs, + aa_motif_keys, + .. + } = nextclade.get_initial_data(); + + let mut output_writer = NextcladeOrderedWriter::new( + &nextclade.gene_map, + clade_node_attr_key_descs, + phenotype_attr_descs, + aa_motif_keys, &csv_column_config, - in_order, + &run_args.outputs, + &nextclade.params, ) .wrap_err("When creating output writer") .unwrap(); - if include_reference { + if nextclade.params.general.include_reference { output_writer - .write_ref(&ref_record, ref_translation) + .write_ref(&nextclade.ref_record, &nextclade.ref_translation) .wrap_err("When writing output record for ref sequence") .unwrap(); } for record in result_receiver { - if should_keep_outputs { - if let Ok((_, _, nextclade_outputs)) = &record.outputs_or_err { - outputs.push(nextclade_outputs.clone()); + if should_write_tree { + // Save analysis results if they will be needed later + if let Ok(AnalysisOutput { analysis_result, .. 
}) = &record.outputs_or_err { + outputs.push(analysis_result.clone()); } } @@ -319,20 +158,25 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> { }); }); - if output_tree.is_some() || output_tree_nwk.is_some() || output_graph.is_some() { - graph_attach_new_nodes_in_place(&mut graph, outputs, ref_seq.len(), &tree_builder_params)?; + if should_write_tree { + let Nextclade { + ref_seq, params, graph, .. + } = nextclade; + if let Some(mut graph) = graph { + graph_attach_new_nodes_in_place(&mut graph, outputs, ref_seq.len(), ¶ms.tree_builder)?; - if let Some(output_tree) = output_tree { - let tree = convert_graph_to_auspice_tree(&graph)?; - json_write(output_tree, &tree, JsonPretty(true))?; - } + if let Some(output_tree) = output_tree { + let tree = convert_graph_to_auspice_tree(&graph)?; + json_write(output_tree, &tree, JsonPretty(true))?; + } - if let Some(output_tree_nwk) = output_tree_nwk { - nwk_write_to_file(output_tree_nwk, &graph)?; - } + if let Some(output_tree_nwk) = output_tree_nwk { + nwk_write_to_file(output_tree_nwk, &graph)?; + } - if let Some(output_graph) = run_args.outputs.output_graph { - json_write(output_graph, &graph, JsonPretty(true))?; + if let Some(output_graph) = run_args.outputs.output_graph { + json_write(output_graph, &graph, JsonPretty(true))?; + } } } diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs b/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs index b1d36abc3..6a145b3d0 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs @@ -1,3 +1,4 @@ +use crate::cli::nextclade_cli::NextcladeRunOutputArgs; use crate::cli::nextclade_loop::NextcladeRecord; use eyre::{Report, WrapErr}; use itertools::Itertools; @@ -5,12 +6,12 @@ use log::{info, warn}; use nextclade::alphabet::nuc::from_nuc_seq; use nextclade::analyze::virus_properties::PhenotypeAttrDesc; use nextclade::gene::gene_map::GeneMap; -use 
nextclade::io::errors_csv::ErrorsCsvWriter; use nextclade::io::fasta::{FastaPeptideWriter, FastaRecord, FastaWriter}; -use nextclade::io::insertions_csv::InsertionsCsvWriter; use nextclade::io::ndjson::NdjsonFileWriter; use nextclade::io::nextclade_csv::{CsvColumnConfig, NextcladeResultsCsvFileWriter}; use nextclade::io::results_json::ResultsJsonWriter; +use nextclade::run::nextclade_wasm::AnalysisOutput; +use nextclade::run::params::NextcladeInputParams; use nextclade::translate::translate_genes::Translation; use nextclade::tree::tree::CladeNodeAttrKeyDesc; use nextclade::types::outputs::NextcladeOutputs; @@ -18,55 +19,41 @@ use nextclade::utils::error::report_to_string; use nextclade::utils::option::OptionMapRefFallible; use std::collections::HashMap; use std::hash::Hasher; -use std::path::PathBuf; /// Writes output files, potentially preserving the initial order of records (same as in the inputs) -pub struct NextcladeOrderedWriter<'a> { +pub struct NextcladeOrderedWriter { fasta_writer: Option, fasta_peptide_writer: Option, output_json_writer: Option, output_ndjson_writer: Option, output_csv_writer: Option, output_tsv_writer: Option, - insertions_csv_writer: Option, - errors_csv_writer: Option>, expected_index: usize, queue: HashMap, in_order: bool, } -impl<'a> NextcladeOrderedWriter<'a> { +impl NextcladeOrderedWriter { pub fn new( - gene_map: &'a GeneMap, + gene_map: &GeneMap, clade_node_attr_key_descs: &[CladeNodeAttrKeyDesc], phenotype_attr_key_desc: &[PhenotypeAttrDesc], aa_motifs_keys: &[String], - output_fasta: &Option, - output_json: &Option, - output_ndjson: &Option, - output_csv: &Option, - output_tsv: &Option, - output_insertions: &Option, - output_errors: &Option, - output_translations: &Option, csv_column_config: &CsvColumnConfig, - in_order: bool, + output_params: &NextcladeRunOutputArgs, + params: &NextcladeInputParams, ) -> Result { - let fasta_writer = output_fasta.map_ref_fallible(FastaWriter::from_path)?; + let fasta_writer = 
output_params.output_fasta.map_ref_fallible(FastaWriter::from_path)?; - let fasta_peptide_writer = output_translations + let fasta_peptide_writer = output_params + .output_translations .map_ref_fallible(|output_translations| FastaPeptideWriter::new(gene_map, output_translations))?; - let insertions_csv_writer = output_insertions.map_ref_fallible(InsertionsCsvWriter::new)?; - - let errors_csv_writer = - output_errors.map_ref_fallible(|output_errors| ErrorsCsvWriter::new(gene_map, output_errors))?; - - let output_json_writer = output_json.map_ref_fallible(|output_json| { + let output_json_writer = output_params.output_json.map_ref_fallible(|output_json| { ResultsJsonWriter::new(output_json, clade_node_attr_key_descs, phenotype_attr_key_desc) })?; - let output_ndjson_writer = output_ndjson.map_ref_fallible(NdjsonFileWriter::new)?; + let output_ndjson_writer = output_params.output_ndjson.map_ref_fallible(NdjsonFileWriter::new)?; let clade_node_attr_keys = clade_node_attr_key_descs .iter() @@ -78,7 +65,7 @@ impl<'a> NextcladeOrderedWriter<'a> { .map(|desc| desc.name.clone()) .collect_vec(); - let output_csv_writer = output_csv.map_ref_fallible(|output_csv| { + let output_csv_writer = output_params.output_csv.map_ref_fallible(|output_csv| { NextcladeResultsCsvFileWriter::new( output_csv, b';', @@ -89,7 +76,7 @@ impl<'a> NextcladeOrderedWriter<'a> { ) })?; - let output_tsv_writer = output_tsv.map_ref_fallible(|output_tsv| { + let output_tsv_writer = output_params.output_tsv.map_ref_fallible(|output_tsv| { NextcladeResultsCsvFileWriter::new( output_tsv, b'\t', @@ -107,11 +94,9 @@ impl<'a> NextcladeOrderedWriter<'a> { output_ndjson_writer, output_csv_writer, output_tsv_writer, - insertions_csv_writer, - errors_csv_writer, expected_index: 0, queue: HashMap::::new(), - in_order, + in_order: params.general.in_order, }) } @@ -141,7 +126,11 @@ impl<'a> NextcladeOrderedWriter<'a> { } = record; match outputs_or_err { - Ok((qry_seq_stripped, translation, nextclade_outputs)) => { + 
Ok(AnalysisOutput { + query, + translation, + analysis_result, + }) => { let NextcladeOutputs { warnings, insertions, @@ -149,10 +138,10 @@ impl<'a> NextcladeOrderedWriter<'a> { missing_genes, is_reverse_complement, .. - } = &nextclade_outputs; + } = &analysis_result; if let Some(fasta_writer) = &mut self.fasta_writer { - fasta_writer.write(&seq_name, &from_nuc_seq(&qry_seq_stripped), *is_reverse_complement)?; + fasta_writer.write(&seq_name, &from_nuc_seq(&query), *is_reverse_complement)?; } if let Some(fasta_peptide_writer) = &mut self.fasta_peptide_writer { @@ -161,32 +150,24 @@ impl<'a> NextcladeOrderedWriter<'a> { } } - if let Some(insertions_csv_writer) = &mut self.insertions_csv_writer { - insertions_csv_writer.write(&seq_name, insertions, aa_insertions)?; - } - for warning in warnings { info!("In sequence #{index} '{seq_name}': {}", warning.warning); } - if let Some(errors_csv_writer) = &mut self.errors_csv_writer { - errors_csv_writer.write_aa_errors(&seq_name, warnings, missing_genes)?; - } - if let Some(output_csv_writer) = &mut self.output_csv_writer { - output_csv_writer.write(&nextclade_outputs)?; + output_csv_writer.write(&analysis_result)?; } if let Some(output_tsv_writer) = &mut self.output_tsv_writer { - output_tsv_writer.write(&nextclade_outputs)?; + output_tsv_writer.write(&analysis_result)?; } if let Some(output_ndjson_writer) = &mut self.output_ndjson_writer { - output_ndjson_writer.write(&nextclade_outputs)?; + output_ndjson_writer.write(&analysis_result)?; } if let Some(output_json_writer) = &mut self.output_json_writer { - output_json_writer.write(nextclade_outputs); + output_json_writer.write(analysis_result); } } Err(report) => { @@ -194,12 +175,6 @@ impl<'a> NextcladeOrderedWriter<'a> { warn!( "In sequence #{index} '{seq_name}': {cause}. Note that this sequence will not be included in the results." 
); - if let Some(insertions_csv_writer) = &mut self.insertions_csv_writer { - insertions_csv_writer.write(&seq_name, &[], &[])?; - } - if let Some(errors_csv_writer) = &mut self.errors_csv_writer { - errors_csv_writer.write_nuc_error(&seq_name, &cause)?; - } if let Some(output_csv_writer) = &mut self.output_csv_writer { output_csv_writer.write_nuc_error(index, &seq_name, &cause)?; } @@ -267,7 +242,7 @@ impl<'a> NextcladeOrderedWriter<'a> { } } -impl<'a> Drop for NextcladeOrderedWriter<'a> { +impl Drop for NextcladeOrderedWriter { fn drop(&mut self) { self.finish().wrap_err("When finalizing output writer").unwrap(); } diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_read_annotation.rs b/packages_rs/nextclade-cli/src/cli/nextclade_read_annotation.rs new file mode 100644 index 000000000..f4182801e --- /dev/null +++ b/packages_rs/nextclade-cli/src/cli/nextclade_read_annotation.rs @@ -0,0 +1,54 @@ +use crate::cli::nextclade_cli::NextcladeReadAnnotationArgs; +use eyre::Report; +use nextclade::features::feature_tree::FeatureTree; +use nextclade::gene::gene_map::GeneMap; +use nextclade::gene::gene_map_display::gene_map_to_table_string; +use nextclade::io::file::open_file_or_stdin; +use nextclade::io::json::{json_or_yaml_write, json_stringify, JsonPretty}; +use std::io::Read; + +pub fn nextclade_read_annotation(args: &NextcladeReadAnnotationArgs) -> Result<(), Report> { + let content = { + let mut content = String::new(); + open_file_or_stdin(&args.input_annotation)?.read_to_string(&mut content)?; + content + }; + + if args.feature_tree { + handle_feature_tree(args, &content) + } else { + handle_genome_annotation(args, &content) + } +} + +fn handle_genome_annotation(args: &NextcladeReadAnnotationArgs, content: &str) -> Result<(), Report> { + let data = GeneMap::from_str(content)?; + + if args.json { + println!("{}\n", json_stringify(&data, JsonPretty(true))?); + } else { + println!("{}", gene_map_to_table_string(&data)?); + } + + if let Some(output) = &args.output { 
+ json_or_yaml_write(output, &data)?; + } + + Ok(()) +} + +fn handle_feature_tree(args: &NextcladeReadAnnotationArgs, content: &str) -> Result<(), Report> { + let data = FeatureTree::from_gff3_str(content)?; + + if args.json { + println!("{}\n", json_stringify(&data, JsonPretty(true))?); + } else { + println!("{}", data.to_pretty_string()?); + } + + if let Some(output) = &args.output { + json_or_yaml_write(output, &data)?; + } + + Ok(()) +} diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs b/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs new file mode 100644 index 000000000..3d864617e --- /dev/null +++ b/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs @@ -0,0 +1,410 @@ +use crate::cli::nextclade_cli::{NextcladeRunOtherParams, NextcladeSortArgs}; +use crate::dataset::dataset_download::download_datasets_index_json; +use crate::io::http_client::HttpClient; +use eyre::{Report, WrapErr}; +use itertools::Itertools; +use log::{trace, LevelFilter}; +use nextclade::io::csv::CsvStructFileWriter; +use nextclade::io::fasta::{FastaReader, FastaRecord, FastaWriter}; +use nextclade::io::fs::path_to_string; +use nextclade::make_error; +use nextclade::sort::minimizer_index::{MinimizerIndexJson, MINIMIZER_INDEX_ALGO_VERSION}; +use nextclade::sort::minimizer_search::{run_minimizer_search, MinimizerSearchRecord}; +use nextclade::utils::option::{OptionMapMutFallible, OptionMapRefFallible}; +use nextclade::utils::string::truncate; +use ordered_float::OrderedFloat; +use owo_colors::OwoColorize; +use schemars::JsonSchema; +use serde::Serialize; +use std::collections::btree_map::Entry::{Occupied, Vacant}; +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use tinytemplate::TinyTemplate; + +pub fn nextclade_seq_sort(args: &NextcladeSortArgs) -> Result<(), Report> { + check_args(args)?; + + let NextcladeSortArgs { + server, + proxy_config, + input_minimizer_index_json, + .. 
+ } = args; + + let verbose = log::max_level() >= LevelFilter::Info; + + let minimizer_index = if let Some(input_minimizer_index_json) = &input_minimizer_index_json { + // If a file is provided, use data from it + MinimizerIndexJson::from_path(input_minimizer_index_json) + } else { + // Otherwise fetch from dataset server + let mut http = HttpClient::new(server, proxy_config, verbose)?; + let index = download_datasets_index_json(&mut http)?; + let minimizer_index_path = index + .minimizer_index + .iter() + .find(|minimizer_index| MINIMIZER_INDEX_ALGO_VERSION == minimizer_index.version) + .map(|minimizer_index| &minimizer_index.path); + + if let Some(minimizer_index_path) = minimizer_index_path { + let minimizer_index_str = http.get(minimizer_index_path)?; + MinimizerIndexJson::from_str(String::from_utf8(minimizer_index_str)?) + } else { + let server_versions = index + .minimizer_index + .iter() + .map(|minimizer_index| format!("'{}'", minimizer_index.version)) + .join(","); + let server_versions = if server_versions.is_empty() { + "none available".to_owned() + } else { + format!(": {server_versions}") + }; + + make_error!("No compatible reference minimizer index data is found for this dataset sever. Cannot proceed. \n\nThis version of Nextclade supports index versions up to '{}', but the server has {}.\n\nTry to to upgrade Nextclade to the latest version and/or contact dataset server maintainers.", MINIMIZER_INDEX_ALGO_VERSION, server_versions) + } + }?; + + run(args, &minimizer_index, verbose) +} + +pub fn run(args: &NextcladeSortArgs, minimizer_index: &MinimizerIndexJson, verbose: bool) -> Result<(), Report> { + let NextcladeSortArgs { + input_fastas, + search_params, + other_params: NextcladeRunOtherParams { jobs }, + .. 
+ } = args; + + std::thread::scope(|s| { + const CHANNEL_SIZE: usize = 128; + let (fasta_sender, fasta_receiver) = crossbeam_channel::bounded::(CHANNEL_SIZE); + let (result_sender, result_receiver) = crossbeam_channel::bounded::(CHANNEL_SIZE); + + s.spawn(|| { + let mut reader = FastaReader::from_paths(input_fastas).unwrap(); + loop { + let mut record = FastaRecord::default(); + reader.read(&mut record).unwrap(); + if record.is_empty() { + break; + } + fasta_sender + .send(record) + .wrap_err("When sending a FastaRecord") + .unwrap(); + } + drop(fasta_sender); + }); + + for _ in 0..*jobs { + let fasta_receiver = fasta_receiver.clone(); + let result_sender = result_sender.clone(); + + s.spawn(move || { + let result_sender = result_sender.clone(); + + for fasta_record in &fasta_receiver { + trace!("Processing sequence '{}'", fasta_record.seq_name); + + let result = run_minimizer_search(&fasta_record, minimizer_index, search_params) + .wrap_err_with(|| { + format!( + "When processing sequence #{} '{}'", + fasta_record.index, fasta_record.seq_name + ) + }) + .unwrap(); + + result_sender + .send(MinimizerSearchRecord { fasta_record, result }) + .wrap_err("When sending minimizer record into the channel") + .unwrap(); + } + + drop(result_sender); + }); + } + + let writer = s.spawn(move || { + writer_thread(args, result_receiver, verbose).unwrap(); + }); + }); + + Ok(()) +} + +#[derive(Clone, Default, Debug, Serialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +struct SeqSortCsvEntry<'a> { + seq_name: &'a str, + dataset: Option<&'a str>, + score: Option, + num_hits: Option, +} + +fn writer_thread( + args: &NextcladeSortArgs, + result_receiver: crossbeam_channel::Receiver, + verbose: bool, +) -> Result<(), Report> { + let NextcladeSortArgs { + output_dir, + output_path, + output_results_tsv, + .. 
+ } = args; + + let template = output_path.map_ref_fallible(move |output_path| -> Result { + let mut template = TinyTemplate::new(); + template + .add_template("output", output_path) + .wrap_err_with(|| format!("When parsing template: '{output_path}'"))?; + Ok(template) + })?; + + let mut writers = BTreeMap::new(); + let mut stats = StatsPrinter::new(verbose); + + let mut results_csv = + output_results_tsv.map_ref_fallible(|output_results_tsv| CsvStructFileWriter::new(output_results_tsv, b'\t'))?; + + for record in result_receiver { + stats.print_seq(&record); + + let datasets = &record.result.datasets; + + if datasets.is_empty() { + results_csv.map_mut_fallible(|results_csv| { + results_csv.write(&SeqSortCsvEntry { + seq_name: &record.fasta_record.seq_name, + dataset: None, + score: None, + num_hits: None, + }) + })?; + } + + for dataset in datasets { + results_csv.map_mut_fallible(|results_csv| { + results_csv.write(&SeqSortCsvEntry { + seq_name: &record.fasta_record.seq_name, + dataset: Some(&dataset.name), + score: Some(dataset.score), + num_hits: Some(dataset.n_hits), + }) + })?; + } + + let names = datasets + .iter() + .map(|dataset| get_all_prefix_names(&dataset.name)) + .collect::>, Report>>()? 
+ .into_iter() + .flatten() + .unique(); + + for name in names { + let filepath = get_filepath(&name, &template, output_dir)?; + + if let Some(filepath) = filepath { + let writer = get_or_insert_writer(&mut writers, filepath)?; + writer.write(&record.fasta_record.seq_name, &record.fasta_record.seq, false)?; + } + } + } + + stats.finish(); + + Ok(()) +} + +pub fn get_all_prefix_names(name: impl AsRef) -> Result, Report> { + name + .as_ref() + .split('/') + .scan(PathBuf::new(), |name, component| { + *name = name.join(component); + Some(name.clone()) + }) + .unique() + .map(path_to_string) + .collect() +} + +struct StatsPrinter { + enabled: bool, + stats: BTreeMap, + n_undetected: usize, +} + +impl StatsPrinter { + pub fn new(enabled: bool) -> Self { + if enabled { + println!("Suggested datasets for each sequence"); + println!("{}┐", "─".repeat(110)); + println!( + "{:^40} │ {:^40} │ {:^10} │ {:^10} │", + "Sequence name", "Dataset", "Score", "Num. hits" + ); + println!("{}┤", "─".repeat(110)); + } + + Self { + enabled, + stats: BTreeMap::new(), + n_undetected: 0, + } + } + + pub fn print_seq(&mut self, record: &MinimizerSearchRecord) { + if !self.enabled { + return; + } + + let datasets = record + .result + .datasets + .iter() + .sorted_by_key(|dataset| -OrderedFloat(dataset.score)) + .collect_vec(); + + print!("{:<40}", truncate(&record.fasta_record.seq_name, 40)); + + if datasets.is_empty() { + println!(" │ {:40} │ {:>10.3} │ {:>10} │", "undetected".red(), "", ""); + self.n_undetected += 1; + } + + for (i, dataset) in datasets.into_iter().enumerate() { + let name = &dataset.name; + *self.stats.entry(name.clone()).or_insert(1) += 1; + + if i != 0 { + print!("{:<40}", ""); + } + + println!( + " │ {:40} │ {:>10.3} │ {:>10} │", + &truncate(&dataset.name, 40), + &dataset.score, + &dataset.n_hits, + ); + } + + println!("{}┤", "─".repeat(110)); + } + + pub fn finish(&self) { + if !self.enabled { + return; + } + + println!("\n\nSuggested datasets"); + println!("{}┐", 
"─".repeat(67)); + println!("{:^40} │ {:^10} │ {:^10} │", "Dataset", "Num. seq", "Percent"); + println!("{}┤", "─".repeat(67)); + + let total_seq = self.stats.values().sum::() + self.n_undetected; + let stats = self + .stats + .iter() + .sorted_by_key(|(name, n_seq)| (-(**n_seq as isize), (*name).clone())); + + for (name, n_seq) in stats { + println!( + "{:<40} │ {:>10} │ {:>9.3}% │", + name, + n_seq, + 100.0 * (*n_seq as f64 / total_seq as f64) + ); + } + + if self.n_undetected > 0 { + println!("{}┤", "─".repeat(67)); + println!( + "{:<40} │ {:>10} │ {:>10} │", + "undetected".red(), + self.n_undetected.red(), + format!("{:>9.3}%", 100.0 * (self.n_undetected as f64 / total_seq as f64)).red() + ); + } + + println!("{}┤", "─".repeat(67)); + println!( + "{:>40} │ {:>10} │ {:>10} │", + "total".bold(), + total_seq.bold(), + format!("{:>9.3}%", 100.0).bold() + ); + println!("{}┘", "─".repeat(67)); + } +} + +fn get_or_insert_writer( + writers: &mut BTreeMap, + filepath: impl AsRef, +) -> Result<&mut FastaWriter, Report> { + Ok(match writers.entry(filepath.as_ref().to_owned()) { + Occupied(e) => e.into_mut(), + Vacant(e) => e.insert(FastaWriter::from_path(filepath)?), + }) +} + +fn get_filepath( + name: &str, + tt: &Option, + output_dir: &Option, +) -> Result, Report> { + Ok(match (&tt, output_dir) { + (Some(tt), None) => { + let filepath_str = tt + .render("output", &OutputTemplateContext { name }) + .wrap_err("When rendering output path template")?; + + Some(PathBuf::from_str(&filepath_str).wrap_err_with(|| format!("Invalid output path: '{filepath_str}'"))?) + } + (None, Some(output_dir)) => Some(output_dir.join(name).join("sequences.fasta")), + _ => None, + }) +} + +#[derive(Serialize)] +struct OutputTemplateContext<'a> { + name: &'a str, +} + +fn check_args(args: &NextcladeSortArgs) -> Result<(), Report> { + let NextcladeSortArgs { + output_dir, + output_path: output, + .. 
+ } = args; + + if output.is_some() && output_dir.is_some() { + return make_error!( + "The arguments `--output-dir` and `--output` cannot be used together. Remove one or the other." + ); + } + + if let Some(output) = output { + if !output.contains("{name}") { + return make_error!( + r#" +Expected `--output` argument to contain a template string containing template variable {{name}} (with curly braces), but received: + + {output} + +Make sure the variable is not substituted by your shell, programming language or workflow manager. Apply proper escaping as needed. +Example for bash shell: + + --output='outputs/{{name}}/sorted.fasta.gz' + + "# + ); + } + } + + Ok(()) +} diff --git a/packages_rs/nextclade-cli/src/dataset/dataset_attributes.rs b/packages_rs/nextclade-cli/src/dataset/dataset_attributes.rs deleted file mode 100644 index 3045d2bb3..000000000 --- a/packages_rs/nextclade-cli/src/dataset/dataset_attributes.rs +++ /dev/null @@ -1,57 +0,0 @@ -use eyre::{Report, WrapErr}; -use indexmap::IndexMap; -use itertools::Itertools; -use lazy_static::lazy_static; -use nextclade::make_error; -use regex::Regex; - -pub fn parse_dataset_attributes(attribute_strs: &[String]) -> Result, Report> { - attribute_strs - .iter() - .map(|attr| -> Result<(String, String), Report> { parse_dataset_attribute(attr) }) - .collect::, Report>>() -} - -const DATASET_ATTR_REGEX: &str = r#"(['"]?(?P.+)['"]?=['"]?(?P.+)['"]?)"#; - -pub fn parse_dataset_attribute(s: &str) -> Result<(String, String), Report> { - lazy_static! 
{ - static ref RE: Regex = Regex::new(DATASET_ATTR_REGEX) - .wrap_err_with(|| format!("When compiling regular expression for dataset attributes: '{DATASET_ATTR_REGEX}'")) - .unwrap(); - } - - if let Some(captures) = RE.captures(s) { - return match (captures.name("key"), captures.name("val")) { - (Some(key), Some(val)) => { - let key: String = key.as_str().to_owned(); - let val: String = val.as_str().to_owned(); - Ok((key, val)) - } - _ => make_error!("Unable to parse dataset attribute: '{s}'"), - }; - } - make_error!("Unable to parse dataset attribute: '{s}'") -} - -pub fn format_attribute_list( - name: &Option, - reference: &str, - tag: &str, - attributes: &IndexMap, -) -> String { - let mut attributes_fmt = IndexMap::::new(); - - if let Some(name) = name { - attributes_fmt.insert("name".to_owned(), name.clone()); - } - attributes_fmt.insert("reference".to_owned(), reference.to_owned()); - attributes_fmt.insert("tag".to_owned(), tag.to_owned()); - attributes_fmt.extend(attributes.clone().into_iter()); - - attributes_fmt - .into_iter() - .map(|(key, val)| format!("{key}='{val}'")) - .collect_vec() - .join(", ") -} diff --git a/packages_rs/nextclade-cli/src/dataset/dataset_download.rs b/packages_rs/nextclade-cli/src/dataset/dataset_download.rs index f58b02d23..cbf88065b 100644 --- a/packages_rs/nextclade-cli/src/dataset/dataset_download.rs +++ b/packages_rs/nextclade-cli/src/dataset/dataset_download.rs @@ -1,68 +1,76 @@ -use crate::cli::nextclade_cli::NextcladeRunArgs; -use crate::cli::nextclade_dataset_get::{dataset_file_http_get, nextclade_dataset_http_get, DatasetHttpGetParams}; +use crate::cli::nextclade_cli::{NextcladeRunArgs, NextcladeRunInputArgs}; +use crate::cli::nextclade_dataset_get::{dataset_file_http_get, dataset_http_get}; use crate::io::http_client::{HttpClient, ProxyConfig}; -use eyre::{Report, WrapErr}; +use eyre::{eyre, ContextCompat, Report, WrapErr}; use itertools::Itertools; use log::LevelFilter; -use nextclade::analyze::pcr_primers::PcrPrimer; 
-use nextclade::analyze::virus_properties::VirusProperties; +use nextclade::analyze::virus_properties::{LabelledMutationsConfig, VirusProperties}; use nextclade::gene::gene_map::{filter_gene_map, GeneMap}; -use nextclade::io::dataset::{Dataset, DatasetsIndexJson}; -use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str, FastaRecord}; -use nextclade::io::fs::absolute_path; -use nextclade::io::json::json_parse_bytes; -use nextclade::make_error; -use nextclade::qc::qc_config::QcConfig; +use nextclade::io::dataset::{Dataset, DatasetAttributeValue, DatasetAttributes, DatasetFiles, DatasetsIndexJson}; +use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str}; +use nextclade::io::file::create_file_or_stdout; +use nextclade::io::fs::{ensure_dir, has_extension, read_file_to_string}; +use nextclade::run::nextclade_wasm::NextcladeParams; use nextclade::tree::tree::AuspiceTree; -use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; -use std::fs; +use nextclade::utils::option::OptionMapRefFallible; +use nextclade::{make_error, make_internal_error, o}; +use rayon::iter::ParallelIterator; +use std::collections::BTreeMap; use std::fs::File; -use std::io::{BufReader, Read, Seek}; -use std::path::Path; +use std::io::{BufReader, Read, Seek, Write}; +use std::path::{Path, PathBuf}; use std::str::FromStr; use zip::ZipArchive; +const PATHOGEN_JSON: &str = "pathogen.json"; + +pub fn nextclade_get_inputs( + run_args: &NextcladeRunArgs, + genes: &Option>, +) -> Result { + if let Some(dataset_name) = run_args.inputs.dataset_name.as_ref() { + dataset_str_download_and_load(run_args, genes) + .wrap_err_with(|| format!("When downloading dataset '{dataset_name}'")) + } else if let Some(input_dataset) = run_args.inputs.input_dataset.as_ref() { + if input_dataset.is_file() && has_extension(input_dataset, "zip") { + dataset_zip_load(run_args, input_dataset, genes) + .wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}")) + } else if input_dataset.is_dir() { + 
dataset_dir_load(run_args, input_dataset, genes) + .wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}")) + } else { + make_error!( + "--input-dataset: path is invalid. \ + Expected a directory path or a zip archive file path, but got: '{input_dataset:#?}'" + ) + } + } else { + dataset_individual_files_load(run_args, genes) + } +} + #[inline] pub fn download_datasets_index_json(http: &mut HttpClient) -> Result { - json_parse_bytes(http.get(&"/index_v2.json")?.as_slice()) + let data_bytes = http.get("/index.json")?; + let data_str = String::from_utf8(data_bytes)?; + DatasetsIndexJson::from_str(data_str) } -pub fn dataset_dir_download(http: &mut HttpClient, dataset: &Dataset, output_dir: &Path) -> Result<(), Report> { - let output_dir = &absolute_path(output_dir)?; - fs::create_dir_all(output_dir).wrap_err_with(|| format!("When creating directory '{output_dir:#?}'"))?; - - dataset - .files - .par_iter() - .map(|(filename, url)| -> Result<(), Report> { - let output_file_path = output_dir.join(filename); - let content = http.get(url)?; - fs::write(output_file_path, content)?; - Ok(()) - }) - .collect::>() - .wrap_err_with(|| format!("When downloading dataset {dataset:#?}")) +pub fn dataset_zip_fetch(http: &mut HttpClient, dataset: &Dataset) -> Result, Report> { + http + .get(&dataset.file_path("dataset.zip")) + .wrap_err_with(|| format!("When fetching zip file for dataset '{}'", dataset.path)) } pub fn dataset_zip_download(http: &mut HttpClient, dataset: &Dataset, output_file_path: &Path) -> Result<(), Report> { - if let Some(parent_dir) = output_file_path.parent() { - let parent_dir = &absolute_path(parent_dir)?; - fs::create_dir_all(parent_dir) - .wrap_err_with(|| format!("When creating parent directory '{parent_dir:#?}' for file '{output_file_path:#?}'"))?; - } + let mut file = + create_file_or_stdout(output_file_path).wrap_err_with(|| format!("When opening file {output_file_path:?}"))?; - let content = http.get(&dataset.zip_bundle)?; - 
fs::write(output_file_path, content) - .wrap_err_with(|| format!("When writing downloaded dataset zip file to {output_file_path:#?}")) -} + let content = dataset_zip_fetch(http, dataset)?; -pub struct DatasetFilesContent { - pub ref_record: FastaRecord, - pub virus_properties: VirusProperties, - pub tree: AuspiceTree, - pub gene_map: GeneMap, - pub qc_config: QcConfig, - pub primers: Vec, + file + .write_all(&content) + .wrap_err_with(|| format!("When writing downloaded dataset zip file to {output_file_path:#?}")) } pub fn zip_read_str(zip: &mut ZipArchive, name: &str) -> Result { @@ -71,159 +79,246 @@ pub fn zip_read_str(zip: &mut ZipArchive, name: &str) -> Resu Ok(s) } +pub fn read_from_path_or_zip( + filepath: &Option>, + zip: &mut ZipArchive>, + zip_filename: &str, +) -> Result, Report> { + if let Some(filepath) = filepath { + return Ok(Some(read_file_to_string(filepath)?)); + } + Ok(zip_read_str(zip, zip_filename).ok()) +} + pub fn dataset_zip_load( run_args: &NextcladeRunArgs, dataset_zip: impl AsRef, genes: &Option>, -) -> Result { +) -> Result { let file = File::open(dataset_zip)?; let buf_file = BufReader::new(file); let mut zip = ZipArchive::new(buf_file)?; - let ref_record = run_args.inputs.input_ref.as_ref().map_or_else( - || read_one_fasta_str(&zip_read_str(&mut zip, "reference.fasta")?), - read_one_fasta, - )?; - - let tree = run_args.inputs.input_tree.as_ref().map_or_else( - || AuspiceTree::from_str(&zip_read_str(&mut zip, "tree.json")?), - AuspiceTree::from_path, - )?; - - let qc_config = run_args.inputs.input_qc_config.as_ref().map_or_else( - || QcConfig::from_str(&zip_read_str(&mut zip, "qc.json")?), - QcConfig::from_path, - )?; - - let virus_properties = run_args.inputs.input_virus_properties.as_ref().map_or_else( - || VirusProperties::from_str(&zip_read_str(&mut zip, "virus_properties.json")?), - VirusProperties::from_path, - )?; - - let primers = run_args.inputs.input_pcr_primers.as_ref().map_or_else( - || PcrPrimer::from_str(&zip_read_str(&mut 
zip, "primers.csv")?, &ref_record.seq), - |input_pcr_primers| PcrPrimer::from_path(input_pcr_primers, &ref_record.seq), - )?; - - let gene_map = run_args.inputs.input_gene_map.as_ref().map_or_else( - || filter_gene_map(Some(GeneMap::from_str(zip_read_str(&mut zip, "genemap.gff")?)?), genes), - |input_gene_map| filter_gene_map(Some(GeneMap::from_file(input_gene_map)?), genes), - )?; - - Ok(DatasetFilesContent { + let virus_properties = read_from_path_or_zip(&run_args.inputs.input_pathogen_json, &mut zip, "pathogen.json")? + .map_ref_fallible(VirusProperties::from_str) + .wrap_err("When reading pathogen JSON from dataset")? + .ok_or_else(|| eyre!("Pathogen JSON must always be present in the dataset but not found."))?; + + let ref_record = read_from_path_or_zip(&run_args.inputs.input_ref, &mut zip, &virus_properties.files.reference)? + .map_ref_fallible(read_one_fasta_str) + .wrap_err("When reading reference sequence from dataset")? + .ok_or_else(|| eyre!("Reference sequence must always be present in the dataset but not found."))?; + + let gene_map = read_from_path_or_zip(&run_args.inputs.input_annotation, &mut zip, "genome_annotation.gff3")? + .map_ref_fallible(GeneMap::from_str) + .wrap_err("When reading genome annotation from dataset")? + .map(|gene_map| filter_gene_map(gene_map, genes)) + .unwrap_or_default(); + + let tree = read_from_path_or_zip(&run_args.inputs.input_tree, &mut zip, "tree.json")? 
+ .map_ref_fallible(AuspiceTree::from_str) + .wrap_err("When reading reference tree JSON from dataset")?; + + Ok(NextcladeParams { ref_record, - virus_properties, - tree, gene_map, - qc_config, - primers, + tree, + virus_properties, }) } -#[rustfmt::skip] +pub fn dataset_dir_download(http: &mut HttpClient, dataset: &Dataset, output_dir: &Path) -> Result<(), Report> { + let mut content = dataset_zip_fetch(http, dataset)?; + let mut reader = std::io::Cursor::new(content.as_mut_slice()); + let mut zip = ZipArchive::new(&mut reader)?; + + ensure_dir(output_dir).wrap_err_with(|| format!("When creating directory {output_dir:#?}"))?; + + zip + .extract(output_dir) + .wrap_err_with(|| format!("When extracting zip archive of dataset '{}'", dataset.path)) +} + pub fn dataset_dir_load( - run_args: NextcladeRunArgs, + run_args: &NextcladeRunArgs, dataset_dir: impl AsRef, genes: &Option>, -) -> Result { - let input_dataset = dataset_dir.as_ref(); - dataset_load_files(DatasetFilePaths { - input_ref: &run_args.inputs.input_ref.unwrap_or_else(|| input_dataset.join("reference.fasta")), - input_tree: &run_args.inputs.input_tree.unwrap_or_else(|| input_dataset.join("tree.json")), - input_qc_config: &run_args.inputs.input_qc_config.unwrap_or_else(|| input_dataset.join("qc.json")), - input_virus_properties: &run_args.inputs.input_virus_properties.unwrap_or_else(|| input_dataset.join("virus_properties.json")), - input_pcr_primers: &run_args.inputs.input_pcr_primers.unwrap_or_else(|| input_dataset.join("primers.csv")), - input_gene_map: &run_args.inputs.input_gene_map.unwrap_or_else(|| input_dataset.join("genemap.gff")), - }, genes) +) -> Result { + let dataset_dir = dataset_dir.as_ref(); + + let NextcladeRunInputArgs { + input_ref, + input_tree, + input_pathogen_json, + input_annotation, + .. 
+ } = &run_args.inputs; + + let input_pathogen_json = input_pathogen_json + .clone() + .unwrap_or_else(|| dataset_dir.join("pathogen.json")); + + let virus_properties = VirusProperties::from_path(input_pathogen_json)?; + + let input_ref = input_ref + .clone() + .unwrap_or_else(|| dataset_dir.join(&virus_properties.files.reference)); + let ref_record = read_one_fasta(input_ref).wrap_err("When reading reference sequence")?; + + let gene_map = input_annotation + .clone() + .or_else(|| { + virus_properties + .files + .genome_annotation + .as_ref() + .map(|genome_annotation| dataset_dir.join(genome_annotation)) + }) + .map_ref_fallible(GeneMap::from_path) + .wrap_err("When reading genome annotation")? + .map(|gen_map| filter_gene_map(gen_map, genes)) + .unwrap_or_default(); + + let tree = input_tree + .clone() + .or_else(|| { + virus_properties + .files + .tree_json + .as_ref() + .map(|tree_json| dataset_dir.join(tree_json)) + }) + .map_ref_fallible(AuspiceTree::from_path) + .wrap_err("When reading reference tree JSON")?; + + Ok(NextcladeParams { + ref_record, + gene_map, + tree, + virus_properties, + }) } pub fn dataset_individual_files_load( run_args: &NextcladeRunArgs, genes: &Option>, -) -> Result { - #[rustfmt::skip] - let required_args = &[ - (String::from("--input-ref"), &run_args.inputs.input_ref), - (String::from("--input-tree"), &run_args.inputs.input_tree), - (String::from("--input-gene-map"), &run_args.inputs.input_gene_map), - (String::from("--input-qc-config"), &run_args.inputs.input_qc_config), - (String::from("--input-pcr-primers"), &run_args.inputs.input_pcr_primers), - (String::from("--input-virus-properties"), &run_args.inputs.input_virus_properties), - ]; - - #[allow(clippy::single_match_else)] - match required_args { - #[rustfmt::skip] - [ - (_, Some(input_ref)), - (_, Some(input_tree)), - (_, Some(input_gene_map)), - (_, Some(input_qc_config)), - (_, Some(input_pcr_primers)), - (_, Some(input_virus_properties)), - ] => { - 
dataset_load_files(DatasetFilePaths { - input_ref, - input_tree, - input_qc_config, - input_virus_properties, - input_pcr_primers, - input_gene_map, - }, genes) - }, - _ => { - let missing_args = required_args - .iter() - .filter_map(|(key, val)| match val { - None => Some(key), - Some(_) => None, - }) - .cloned() - .join(" \n"); - - make_error!("When `--input-dataset` is not specified, the following arguments are required:\n{missing_args}") +) -> Result { + match (&run_args.inputs.input_dataset, &run_args.inputs.input_ref) { + (None, None) => make_error!("When `--input-dataset` is not specified, --input-ref is required"), + (_, Some(input_ref)) => { + let virus_properties = run_args + .inputs + .input_pathogen_json + .as_ref() + .and_then(|input_pathogen_json| read_file_to_string(input_pathogen_json).ok()) + .map_ref_fallible(VirusProperties::from_str) + .wrap_err("When reading pathogen JSON")? + .unwrap_or_else(|| { + // The only case where we allow pathogen.json to be missing is when there's no dataset and files are provided + // explicitly through args. 
Let's create an dummy value to avoid making the field optional + VirusProperties { + schema_version: "".to_owned(), + attributes: DatasetAttributes { + name: DatasetAttributeValue { + value: "".to_owned(), + value_friendly: None, + is_default: None, + other: serde_json::Value::default(), + }, + reference: DatasetAttributeValue { + value: "".to_owned(), + value_friendly: None, + is_default: None, + other: serde_json::Value::default(), + }, + rest_attrs: BTreeMap::default(), + other: serde_json::Value::default(), + }, + files: DatasetFiles { + reference: "".to_owned(), + pathogen_json: "".to_owned(), + genome_annotation: None, + tree_json: None, + examples: None, + readme: None, + changelog: None, + rest_files: BTreeMap::default(), + other: serde_json::Value::default(), + }, + deprecated: false, + enabled: true, + experimental: false, + default_gene: None, + gene_order_preference: vec![], + mut_labels: LabelledMutationsConfig::default(), + primers: vec![], + qc: None, + general_params: None, + alignment_params: None, + tree_builder_params: None, + phenotype_data: None, + aa_motifs: vec![], + versions: vec![], + version: None, + compatibility: None, + other: serde_json::Value::default(), + } + }); + + let ref_record = read_one_fasta(input_ref).wrap_err("When reading reference sequence")?; + + let gene_map = run_args + .inputs + .input_annotation + .as_ref() + .map_ref_fallible(GeneMap::from_path) + .wrap_err("When reading genome annotation")? 
+ .map(|gen_map| filter_gene_map(gen_map, genes)) + .unwrap_or_default(); + + let tree = run_args + .inputs + .input_tree + .as_ref() + .map_ref_fallible(AuspiceTree::from_path) + .wrap_err("When reading reference tree JSON")?; + + Ok(NextcladeParams { + ref_record, + gene_map, + tree, + virus_properties, + }) } + _ => make_internal_error!("Reached unknown match arm"), } } pub struct DatasetFilePaths<'a> { input_ref: &'a Path, - input_tree: &'a Path, - input_qc_config: &'a Path, - input_virus_properties: &'a Path, - input_pcr_primers: &'a Path, - input_gene_map: &'a Path, + input_tree: &'a Option, + input_pathogen_json: &'a Option, + input_annotation: &'a Option, } -pub fn dataset_load_files( - DatasetFilePaths { - input_ref, - input_tree, - input_qc_config, - input_virus_properties, - input_pcr_primers, - input_gene_map, - }: DatasetFilePaths, - genes: &Option>, -) -> Result { - let ref_record = read_one_fasta(input_ref)?; - let primers = PcrPrimer::from_path(input_pcr_primers, &ref_record.seq)?; - - Ok(DatasetFilesContent { - ref_record, - virus_properties: VirusProperties::from_path(input_virus_properties)?, - gene_map: filter_gene_map(Some(GeneMap::from_file(input_gene_map)?), genes)?, - tree: AuspiceTree::from_path(input_tree)?, - qc_config: QcConfig::from_path(input_qc_config)?, - primers, - }) +pub fn read_from_path_or_url( + http: &mut HttpClient, + dataset: &Dataset, + filepath: &Option>, + url: &Option, +) -> Result, Report> { + if let Some(filepath) = filepath { + return Ok(Some(read_file_to_string(filepath)?)); + } else if let Some(url) = url { + return Ok(Some(dataset_file_http_get(http, dataset, url)?)); + } + Ok(None) } pub fn dataset_str_download_and_load( run_args: &NextcladeRunArgs, - dataset_name: &str, genes: &Option>, -) -> Result { +) -> Result { let verbose = log::max_level() > LevelFilter::Info; let mut http = HttpClient::new(&run_args.inputs.server, &ProxyConfig::default(), verbose)?; @@ -233,66 +328,51 @@ pub fn 
dataset_str_download_and_load( .as_ref() .expect("Dataset name is expected, but got 'None'"); - let dataset = nextclade_dataset_http_get( - &mut http, - DatasetHttpGetParams { - name, - reference: "default", - tag: "latest", - }, - &[], - )?; - - let ref_record = run_args.inputs.input_ref.as_ref().map_or_else( - || read_one_fasta_str(&dataset_file_http_get(&mut http, &dataset, "reference.fasta")?), - read_one_fasta, - )?; - - let tree = run_args.inputs.input_tree.as_ref().map_or_else( - || AuspiceTree::from_str(&dataset_file_http_get(&mut http, &dataset, "tree.json")?), - AuspiceTree::from_path, - )?; - - let qc_config = run_args.inputs.input_qc_config.as_ref().map_or_else( - || QcConfig::from_str(&dataset_file_http_get(&mut http, &dataset, "qc.json")?), - QcConfig::from_path, - )?; - - let virus_properties = run_args.inputs.input_virus_properties.as_ref().map_or_else( - || VirusProperties::from_str(&dataset_file_http_get(&mut http, &dataset, "virus_properties.json")?), - VirusProperties::from_path, - )?; - - let primers = run_args.inputs.input_pcr_primers.as_ref().map_or_else( - || { - PcrPrimer::from_str( - &dataset_file_http_get(&mut http, &dataset, "primers.csv")?, - &ref_record.seq, - ) - }, - |input_pcr_primers| PcrPrimer::from_path(input_pcr_primers, &ref_record.seq), - )?; - - let gene_map = run_args.inputs.input_gene_map.as_ref().map_or_else( - || { - filter_gene_map( - Some(GeneMap::from_str(dataset_file_http_get( - &mut http, - &dataset, - "genemap.gff", - )?)?), - genes, - ) - }, - |input_gene_map| filter_gene_map(Some(GeneMap::from_file(input_gene_map)?), genes), - )?; + let dataset = dataset_http_get(&mut http, name, &None)?; - Ok(DatasetFilesContent { + let virus_properties = read_from_path_or_url( + &mut http, + &dataset, + &run_args.inputs.input_pathogen_json, + &Some(o!("pathogen.json")), + )? + .map_ref_fallible(VirusProperties::from_str) + .wrap_err("When reading pathogen JSON from dataset")? 
+ .ok_or_else(|| eyre!("Required file not found in dataset: 'pathogen.json'. Please report it to dataset authors."))?; + + let ref_record = read_from_path_or_url( + &mut http, + &dataset, + &run_args.inputs.input_ref, + &Some(dataset.files.reference.clone()), + )? + .map_ref_fallible(read_one_fasta_str)? + .wrap_err("When reading reference sequence from dataset")?; + + let gene_map = read_from_path_or_url( + &mut http, + &dataset, + &run_args.inputs.input_annotation, + &dataset.files.genome_annotation, + )? + .map_ref_fallible(GeneMap::from_str) + .wrap_err("When reading genome annotation from dataset")? + .map(|gene_map| filter_gene_map(gene_map, genes)) + .unwrap_or_default(); + + let tree = read_from_path_or_url( + &mut http, + &dataset, + &run_args.inputs.input_tree, + &dataset.files.tree_json, + )? + .map_ref_fallible(AuspiceTree::from_str) + .wrap_err("When reading reference tree from dataset")?; + + Ok(NextcladeParams { ref_record, - virus_properties, - tree, gene_map, - qc_config, - primers, + tree, + virus_properties, }) } diff --git a/packages_rs/nextclade-cli/src/dataset/dataset_table.rs b/packages_rs/nextclade-cli/src/dataset/dataset_table.rs index 7dc02191b..e1396078e 100644 --- a/packages_rs/nextclade-cli/src/dataset/dataset_table.rs +++ b/packages_rs/nextclade-cli/src/dataset/dataset_table.rs @@ -18,18 +18,16 @@ pub fn format_dataset_table(filtered: &[Dataset]) -> String { "reference".to_owned(), "tag".to_owned(), "attributes".to_owned(), - "comment".to_owned(), ]); for dataset in filtered.iter() { let Dataset { - attributes, comment, .. + version, attributes, .. } = dataset; let DatasetAttributes { name, reference, - tag, rest_attrs, .. 
} = &attributes; @@ -37,7 +35,6 @@ pub fn format_dataset_table(filtered: &[Dataset]) -> String { let mut attrs = IndexMap::::from([ ("name".to_owned(), name), ("reference".to_owned(), reference), - ("tag".to_owned(), tag), ]); for (key, attr) in rest_attrs.iter() { @@ -47,9 +44,8 @@ pub fn format_dataset_table(filtered: &[Dataset]) -> String { table.add_row([ format_attr_value(name), format_attr_value(reference), - format_attr_value(tag), + version.tag.clone(), format_attributes(&attrs), - comment.clone(), ]); } @@ -57,8 +53,8 @@ pub fn format_dataset_table(filtered: &[Dataset]) -> String { } pub fn format_attr_value_short(attr: &DatasetAttributeValue) -> String { - let DatasetAttributeValue { is_default, value, .. } = &attr; - if *is_default { + let DatasetAttributeValue { value, .. } = &attr; + if attr.is_default() { format!("{value} (*)") } else { value.clone() diff --git a/packages_rs/nextclade-cli/src/dataset/mod.rs b/packages_rs/nextclade-cli/src/dataset/mod.rs index 3bed03194..c96e51ebb 100644 --- a/packages_rs/nextclade-cli/src/dataset/mod.rs +++ b/packages_rs/nextclade-cli/src/dataset/mod.rs @@ -1,3 +1,2 @@ -pub mod dataset_attributes; pub mod dataset_download; pub mod dataset_table; diff --git a/packages_rs/nextclade-cli/src/io/http_client.rs b/packages_rs/nextclade-cli/src/io/http_client.rs index 15d67cdf6..ecca60d7b 100644 --- a/packages_rs/nextclade-cli/src/io/http_client.rs +++ b/packages_rs/nextclade-cli/src/io/http_client.rs @@ -1,9 +1,11 @@ use clap::{Parser, ValueHint}; use eyre::Report; use log::info; -use nextclade::{getenv, make_internal_error}; +use nextclade::make_internal_error; +use nextclade::utils::info::{this_package_name, this_package_version_str}; use reqwest::blocking::Client; -use reqwest::{IntoUrl, Method, Proxy}; +use reqwest::{Method, Proxy}; +use std::str::FromStr; use url::Url; #[derive(Parser, Debug, Default)] @@ -32,6 +34,10 @@ pub struct HttpClient { impl HttpClient { pub fn new(root: &Url, proxy_conf: &ProxyConfig, verbose: 
bool) -> Result { + // Append trailing slash to the root URL. Otherwise `Url::join()` replaces the path rather than appending. + // See: https://github.com/servo/rust-url/issues/333 + let root = Url::from_str(&format!("{}/", root.as_str()))?; + let mut client_builder = Client::builder(); client_builder = if let Some(proxy_url) = &proxy_conf.proxy { @@ -52,7 +58,7 @@ impl HttpClient { client_builder }; - let user_agent = format!("{} {}", getenv!("CARGO_PKG_NAME"), getenv!("CARGO_PKG_VERSION")); + let user_agent = format!("{} {}", this_package_name(), this_package_version_str()); let client = client_builder .connection_verbose(verbose) @@ -60,40 +66,46 @@ impl HttpClient { .user_agent(user_agent) .build()?; - Ok(Self { - client, - root: root.clone(), - }) + Ok(Self { client, root }) } - pub fn get(&self, url: &U) -> Result, Report> { + pub fn get + ?Sized>(&self, url: &U) -> Result, Report> { self.request(Method::GET, url) } - pub fn post(&self, url: &U) -> Result, Report> { + pub fn post + ?Sized>(&self, url: &U) -> Result, Report> { self.request(Method::POST, url) } - pub fn put(&self, url: &U) -> Result, Report> { + pub fn put + ?Sized>(&self, url: &U) -> Result, Report> { self.request(Method::PUT, url) } - pub fn patch(&self, url: &U) -> Result, Report> { + pub fn patch + ?Sized>(&self, url: &U) -> Result, Report> { self.request(Method::PATCH, url) } - pub fn delete(&self, url: &U) -> Result, Report> { + pub fn delete + ?Sized>(&self, url: &U) -> Result, Report> { self.request(Method::DELETE, url) } - pub fn head(&self, url: &U) -> Result, Report> { + pub fn head + ?Sized>(&self, url: &U) -> Result, Report> { self.request(Method::HEAD, url) } - pub fn request(&self, method: Method, url: &U) -> Result, Report> { - let abs_url = self.root.join(url.as_str())?; + pub fn request + ?Sized>(&self, method: Method, url: &U) -> Result, Report> { + // Trim leading '/', otherwise Url::join() replaces the path rather than appending. 
+ // See: https://github.com/servo/rust-url/issues/333 + let url = url.as_ref().trim_start_matches('/'); + let abs_url = self.root.join(url)?; info!("HTTP '{method}' request to '{abs_url}'"); - let content = self.client.request(method, abs_url).send()?.bytes()?.to_vec(); + let content = self + .client + .request(method, abs_url) + .send()? + .error_for_status()? + .bytes()? + .to_vec(); Ok(content) } } diff --git a/packages_rs/nextclade-web/Cargo.toml b/packages_rs/nextclade-web/Cargo.toml index 13364ba34..08f8b8da2 100644 --- a/packages_rs/nextclade-web/Cargo.toml +++ b/packages_rs/nextclade-web/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "nextclade-web" -version = "2.14.0" +version = "3.0.0-alpha.0" description = "Alignment, mutation calling, phylogenetic placement, clade assignment and quality control checks for viral genetic sequences. WebAssembly module." edition = "2021" license = "MIT" @@ -12,6 +12,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] assert2 = "=0.3.11" +chrono = { version = "=0.4.26", default-features = false, features = ["clock", "std", "wasmbind"] } console_error_panic_hook = "=0.1.7" eyre = "=0.6.8" getrandom = { version = "=0.2.10", features = ["js"] } @@ -19,12 +20,12 @@ itertools = "=0.11.0" js-sys = { version = "=0.3.64", features = [] } log = "=0.4.19" nextclade = { path = "../nextclade" } +schemars = { version = "=0.8.12", features = ["chrono", "either", "enumset", "indexmap1"] } serde = { version = "=1.0.164", features = ["derive"] } serde-wasm-bindgen = { version = "=0.5.0" } wasm-bindgen = { version = "=0.2.87", features = ["serde-serialize"] } wasm-logger = "=0.2.0" web-sys = { version = "=0.3.64", features = ["console"] } -schemars = { version = "=0.8.12", features = ["chrono", "either", "enumset", "indexmap1"] } [build-dependencies] nextclade = { path = "../nextclade" } diff --git a/packages_rs/nextclade-web/config/next/lib/getEnvVars.ts b/packages_rs/nextclade-web/config/next/lib/getEnvVars.ts index b040b9fd3..ef8a0b4f3 
100644 --- a/packages_rs/nextclade-web/config/next/lib/getEnvVars.ts +++ b/packages_rs/nextclade-web/config/next/lib/getEnvVars.ts @@ -10,6 +10,7 @@ export function getEnvVars() { const DOMAIN = getDomain() const DOMAIN_STRIPPED = DOMAIN.replace('https://', '').replace('http://', '') const DATA_FULL_DOMAIN = getenv('DATA_FULL_DOMAIN') + const DATA_TRY_GITHUB_BRANCH = getenv('DATA_TRY_GITHUB_BRANCH') const common = { BABEL_ENV, @@ -20,6 +21,7 @@ export function getEnvVars() { DOMAIN, DOMAIN_STRIPPED, DATA_FULL_DOMAIN, + DATA_TRY_GITHUB_BRANCH, } if (PRODUCTION) { diff --git a/packages_rs/nextclade-web/config/next/next.config.ts b/packages_rs/nextclade-web/config/next/next.config.ts index f38858486..22cd557fa 100644 --- a/packages_rs/nextclade-web/config/next/next.config.ts +++ b/packages_rs/nextclade-web/config/next/next.config.ts @@ -46,6 +46,7 @@ const { DOMAIN, DOMAIN_STRIPPED, DATA_FULL_DOMAIN, + DATA_TRY_GITHUB_BRANCH, } = getEnvVars() const BRANCH_NAME = getGitBranch() @@ -61,6 +62,7 @@ const clientEnv = { DOMAIN, DOMAIN_STRIPPED, DATA_FULL_DOMAIN, + DATA_TRY_GITHUB_BRANCH, BLOCK_SEARCH_INDEXING: DOMAIN === RELEASE_URL ? 
'0' : '1', } diff --git a/packages_rs/nextclade-web/package.json b/packages_rs/nextclade-web/package.json index c33efea45..7c5b95673 100644 --- a/packages_rs/nextclade-web/package.json +++ b/packages_rs/nextclade-web/package.json @@ -1,6 +1,6 @@ { "name": "@nextstrain/nextclade-web", - "version": "2.14.1", + "version": "3.0.0-alpha.0", "description": "Clade assignment, mutation calling, and sequence quality checks", "homepage": "https://clades.nextstrain.org", "repository": { @@ -109,6 +109,7 @@ "i18next": "19.3.2", "immutable": "4.0.0", "intercept-stdout": "0.1.2", + "is-absolute-url": "3.0.3", "jschardet": "3.0.0", "jsonexport": "3.2.0", "jszip": "3.9.1", @@ -116,6 +117,7 @@ "luxon": "2.3.2", "marked": "4.0.14", "memoize-one": "6.0.0", + "nanoid": "3.3.6", "next": "12.1.6", "next-compose-plugins": "2.2.1", "numbro": "2.3.6", @@ -132,9 +134,10 @@ "react-file-icon": "1.1.0", "react-helmet": "6.1.0", "react-i18next": "11.3.3", - "react-icons": "4.3.1", + "react-icons": "4.11.0", "react-if": "4.1.4", "react-loader-spinner": "5.1.4", + "react-markdown": "6.0.3", "react-no-ssr": "1.1.0", "react-redux": "7.2.8", "react-resize-detector": "7.0.0", @@ -145,13 +148,16 @@ "react-window": "1.8.7", "reactstrap": "8.10.1", "recharts": "2.1.9", - "recoil": "0.7.6", + "recoil": "0.7.7", "recoil-persist": "4.2.0", "redux": "4.2.0", "redux-saga": "1.1.3", "redux-thunk": "2.4.1", "reflect-metadata": "0.1.13", "regenerator-runtime": "0.13.9", + "rehype-raw": "5.1.0", + "rehype-sanitize": "4.0.0", + "remark-gfm": "1.0.0", "reselect": "4.1.5", "semver": "7.3.7", "serialize-javascript": "6.0.0", @@ -234,13 +240,13 @@ "allow-methods": "3.1.0", "babel-plugin-parameter-decorator": "1.0.16", "babel-plugin-transform-typescript-metadata": "0.3.2", + "commander": "10.0.1", "compression-webpack-plugin": "9.2.0", "connect-history-api-fallback": "1.6.0", "conventional-changelog-cli": "2.2.2", "copy-webpack-plugin": "10.2.4", "cross-env": "7.0.3", "css-loader": "6.7.1", - "commander": "10.0.1", 
"dotenv": "16.0.0", "eslint": "8.14.0", "eslint-config-airbnb": "19.0.4", @@ -319,7 +325,7 @@ "remark-autolink-headings": "6.0.1", "remark-breaks": "2.0.1", "remark-images": "2.0.0", - "remark-math": "3.0.1", + "remark-math": "4.0.0", "remark-slug": "6.0.0", "remark-toc": "7.0.0", "rimraf": "3.0.2", diff --git a/packages_rs/nextclade-web/src/build.rs b/packages_rs/nextclade-web/src/build.rs index a857750ce..6dcd7284b 100644 --- a/packages_rs/nextclade-web/src/build.rs +++ b/packages_rs/nextclade-web/src/build.rs @@ -1,9 +1,11 @@ use eyre::Report; -use nextclade::analyze::pcr_primers::PcrPrimer; +use nextclade::analyze::pcr_primer_changes::PcrPrimer; use nextclade::analyze::virus_properties::{PhenotypeAttrDesc, VirusProperties}; use nextclade::gene::gene_map::GeneMap; -use nextclade::io::dataset::{DatasetTagJson, DatasetsIndexJson}; -use nextclade::io::errors_csv::ErrorsFromWeb; +use nextclade::io::dataset::{ + Dataset, DatasetAttributeValue, DatasetAttributes, DatasetCapabilities, DatasetCollectionMeta, DatasetCollectionUrl, + DatasetsIndexJson, +}; use nextclade::io::fasta::FastaRecord; use nextclade::io::file::create_file_or_stdout; use nextclade::io::fs::ensure_dir; @@ -14,6 +16,8 @@ use nextclade::qc::qc_run::QcResult; use nextclade::run::nextclade_wasm::{ AnalysisInitialData, AnalysisInput, NextcladeParams, NextcladeParamsRaw, NextcladeResult, OutputTrees, }; +use nextclade::sort::minimizer_index::MinimizerIndexJson; +use nextclade::sort::minimizer_search::{MinimizerSearchRecord, MinimizerSearchResult}; use nextclade::translate::translate_genes::Translation; use nextclade::tree::tree::{AuspiceTree, CladeNodeAttrKeyDesc}; use nextclade::types::outputs::{NextcladeErrorOutputs, NextcladeOutputs}; @@ -44,7 +48,7 @@ fn write_jsonschema(output_file: impl AsRef) -> Result<(), /// it. Instead, See the actual types in the `definitions` property of JSON schema. 
#[derive(Clone, Debug, JsonSchema)] #[serde(rename_all = "camelCase")] -struct _SchemaRoot { +struct _SchemaRoot<'a> { _1: GeneMap, _2: Translation, _3: AuspiceTree, @@ -52,19 +56,26 @@ struct _SchemaRoot { _5: QcResult, _6: PcrPrimer, _7: NextcladeOutputs, - _8: DatasetsIndexJson, _9: CsvColumnConfig, _10: NextcladeErrorOutputs, - _11: ErrorsFromWeb, _12: VirusProperties, _13: CladeNodeAttrKeyDesc, _14: PhenotypeAttrDesc, _15: FastaRecord, - _16: DatasetTagJson, - _17: AnalysisInitialData, + _17: AnalysisInitialData<'a>, _18: AnalysisInput, _19: NextcladeResult, - _21: NextcladeParams, - _22: NextcladeParamsRaw, - _23: OutputTrees, + _20: NextcladeParams, + _21: NextcladeParamsRaw, + _22: OutputTrees, + _23: DatasetsIndexJson, + _24: Dataset, + _25: DatasetCollectionMeta, + _26: DatasetCapabilities, + _27: DatasetAttributeValue, + _28: DatasetAttributes, + _29: DatasetCollectionUrl, + _30: MinimizerIndexJson, + _31: MinimizerSearchResult, + _32: MinimizerSearchRecord, } diff --git a/packages_rs/nextclade-web/src/components/About/About.tsx b/packages_rs/nextclade-web/src/components/About/About.tsx deleted file mode 100644 index 7f15d6fde..000000000 --- a/packages_rs/nextclade-web/src/components/About/About.tsx +++ /dev/null @@ -1,7 +0,0 @@ -import React from 'react' - -import AboutContent from './AboutContent.mdx' - -export function About() { - return -} diff --git a/packages_rs/nextclade-web/src/components/About/AboutContent.mdx b/packages_rs/nextclade-web/src/components/About/AboutContent.mdx deleted file mode 100644 index 9f97e267a..000000000 --- a/packages_rs/nextclade-web/src/components/About/AboutContent.mdx +++ /dev/null @@ -1,43 +0,0 @@ -import { CladeSchema } from 'src/components/Main/CladeSchema.tsx' - -## What is Nextclade? 
- -Nextclade is a tool that performs genetic sequence alignment, clade assignment, mutation calling, phylogenetic placement, and quality checks for SARS-CoV-2, Influenza (Flu), Mpox (Monkeypox), Respiratory Syncytial Virus (RSV) and other pathogens. - -Nextclade identifies differences between your sequences and a reference sequence, uses these differences to assign your sequences to clades, reports potential sequence quality issues in your data, and shows how the sequences are related to each other by placing them into an existing phylogenetic tree (we call it "phylogenetic placement"). You can use the tool to analyze sequences before you upload them to a database, or if you want to assign Nextstrain clades to a set of sequences. - -To analyze your data, drag a fasta file onto the upload box or paste sequences into the text box. These sequences will then be analyzed in your browser - data never leave your computer. Since your computer is doing the work rather than a server, it is advisable to analyze at most a few hundred sequences at a time. - -The Nextclade app and algorithms are opensource. The code is available on [GitHub](https://github.com/nextstrain/nextclade). The user manual is available at [docs.nextstrain.org/projects/nextclade](https://docs.nextstrain.org/projects/nextclade). - - -### What are the SARS-CoV-2 clades? - -Nextclade was originally developed during COVID-19 pandemic, primarily focused on SARS-CoV-2. This section describes clades with application to SARS-CoV-2, but Nextclade can analyse other pathogens too. - - - -Since its emergence in late 2019, SARS-CoV-2 has diversified into several different co-circulating variants. To facilitate discussion of these variants, we have grouped them into __clades__ which are defined by specific signature mutations. 
- -We currently define more than 30 clades (see [this blog post](https://nextstrain.org/blog/2021-01-06-updated-SARS-CoV-2-clade-naming) for details): - -- 19A and 19B emerged in Wuhan and have dominated the early outbreak -- 20A emerged from 19A out of dominated the European outbreak in March and has since spread globally -- 20B and 20C are large genetically distinct subclades 20A emerged in early 2020 -- 20D to 20J have emerged over the summer of 2020 and include three "Variants of Concern" (VoC). -- 21A to 21F include the VoC __delta__ and several Variants of Interest (VoI). -- 21K onwards are different clades within the diverse VoC __omicron__. - -Within Nextstrain, we define each clade by its combination of signature mutations. You can find the exact clade definition in [github.com/nextstrain/ncov/defaults/clades.tsv](https://github.com/nextstrain/ncov/blob/master/defaults/clades.tsv). When available, we will include [WHO labels for VoCs and VoIs](https://www.who.int/en/activities/tracking-SARS-CoV-2-variants/). - -Learn more about how Nextclade assigns clades in the [documentation](https://docs.nextstrain.org/projects/nextclade/en/stable/user/algorithm/). - -### Other pathogens - -Besides SARS-CoV-2, we provide Nextclade datasets to analyze the following other pathogens: - - * Seasonal Influenza viruses (HA and NA for A/H3N2, A/H1N1pdm, B/Vic, and B/Yam) - * Mpox virus (the overall clade structure, as well as fine-grained lineages within the recent sustained human-to-human transmission) - * Respiratory Syncytial Virus (RSV) (subtypes A and B) - -You can also put together your own dataset to analyse other pathogens. 
diff --git a/packages_rs/nextclade-web/src/components/Autodetect/AutodetectPage.tsx b/packages_rs/nextclade-web/src/components/Autodetect/AutodetectPage.tsx new file mode 100644 index 000000000..4dd2a51e6 --- /dev/null +++ b/packages_rs/nextclade-web/src/components/Autodetect/AutodetectPage.tsx @@ -0,0 +1,249 @@ +// import classNames from 'classnames' +// import { sortBy } from 'lodash' +// import { mix, transparentize } from 'polished' +// import React, { useMemo } from 'react' +// import { Col as ColBase, Row as RowBase } from 'reactstrap' +// import { useRecoilValue } from 'recoil' +// import styled, { useTheme } from 'styled-components' +// import type { MinimizerIndexJson, MinimizerSearchRecord } from 'src/types' +// import { isEven } from 'src/helpers/number' +// import { TableSlim } from 'src/components/Common/TableSlim' +// import { Layout } from 'src/components/Layout/Layout' +// import { safeZip3 } from 'src/helpers/safeZip' +// import { useTranslationSafe } from 'src/helpers/useTranslationSafe' +// import { autodetectResultsAtom, minimizerIndexAtom } from 'src/state/autodetect.state' +// +// const Container = styled.div` +// margin-top: 1rem; +// padding-bottom: 1.5rem; +// height: 100%; +// overflow: hidden; +// ` +// +// const Row = styled(RowBase)` +// overflow: hidden; +// height: 100%; +// ` +// +// const Col = styled(ColBase)` +// overflow: hidden; +// height: 100%; +// ` +// +// const Table = styled(TableSlim)` +// padding-top: 50px; +// +// & thead { +// height: 51px; +// position: sticky; +// top: -2px; +// background-color: ${(props) => props.theme.gray700}; +// color: ${(props) => props.theme.gray100}; +// } +// +// & thead th { +// margin: auto; +// text-align: center; +// vertical-align: middle; +// } +// +// & td { +// border: none; +// border-left: 1px solid #ccc; +// } +// +// & tr { +// border: none !important; +// } +// +// & th { +// border: 1px solid #ccc; +// } +// ` +// +// const TableWrapper = styled.div` +// height: 100%; +// 
overflow-y: auto; +// ` +// +// export function AutodetectPage() { +// const { t } = useTranslationSafe() +// const minimizerIndex = useRecoilValue(minimizerIndexAtom) +// const autodetectResults = useRecoilValue(autodetectResultsAtom) +// +// const rows = useMemo(() => { +// const results = sortBy(autodetectResults, (result) => result.fastaRecord.index) +// return results.map((res, i) => ( +// +// )) +// }, [autodetectResults, minimizerIndex]) +// +// return ( +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// {rows} +//
{'#'}{t('Seq. name')}{t('Length')}{t('Total nHits')}{t('Max norm. hit')}{t('Dataset')}{t('Ref. length')}{t('Num. nHits')}{t('Norm. hit')}
+//
+// +//
+//
+//
+// ) +// } +// +// interface AutodetectTableRowSpanProps { +// order: number +// res: MinimizerSearchRecord +// minimizerIndex: MinimizerIndexJson +// } +// +// function AutodetectTableRowSpan({ order, res, minimizerIndex }: AutodetectTableRowSpanProps) { +// const theme = useTheme() +// +// const { datasets, maxScore, totalHits } = res.result +// const { seqName, index: seqIndex, seq } = res.fastaRecord +// const qryLen = seq.length +// +// const rows = useMemo(() => { +// let entries = sortBy(datasets, (entry) => -entry.score) +// +// let color = isEven(order) ? theme.table.rowBg.even : theme.table.rowBg.odd +// +// const goodEntries = entries.filter(({ score, nHits }) => maxScore >= 0.6 && nHits >= 10 && score >= maxScore * 0.5) +// +// const mediocreEntries = entries.filter( +// ({ score, nHits }) => maxScore >= 0.3 && nHits >= 10 && score >= maxScore * 0.5, +// ) +// +// const badEntries = entries.filter(({ score, nHits }) => maxScore >= 0.05 && nHits > 0 && score >= maxScore * 0.5) +// +// if (goodEntries.length > 0) { +// entries = goodEntries +// } else if (mediocreEntries.length > 0) { +// entries = mediocreEntries +// color = mix(0.3, transparentize(0.3)(theme.warning), color) +// } else { +// entries = badEntries +// color = mix(0.5, transparentize(0.5)(theme.danger), color) +// } +// +// return entries.map(({ dataset, score, nHits, refLen }, i) => { +// const cls = classNames(i === 0 && 'font-weight-bold') +// +// return ( +// +// {i === 0 && ( +// <> +// +// {seqIndex} +// +// +// +// {seqName} +// +// +// +// {qryLen} +// +// +// +// {totalHits} +// +// +// +// {maxScore.toFixed(3)} +// +// +// )} +// +// {dataset} +// +// +// {refLen} +// +// +// {nHits} +// +// +// {score.toFixed(3)} +// +// +// ) +// }) +// }, [ +// datasets, +// order, +// theme.table.rowBg.even, +// theme.table.rowBg.odd, +// theme.warning, +// theme.danger, +// maxScore, +// seqIndex, +// seqName, +// qryLen, +// totalHits, +// ]) +// +// return ( +// <> +// {rows} +// +// +// 
+// +// ) +// } +// +// const Tr = styled.tr<{ $bg?: string }>` +// background-color: ${(props) => props.$bg}; +// ` +// +// const Td = styled.td` +// white-space: nowrap; +// overflow: hidden; +// text-overflow: ellipsis; +// max-width: 100px; +// font-size: 0.95rem; +// ` +// +// const TdName = styled(Td)` +// min-width: 200px; +// font-size: 0.9rem; +// ` +// +// const TdNumeric = styled(Td)` +// text-align: right; +// font-family: ${(props) => props.theme.font.monospace}; +// font-size: 0.9rem; +// ` +// +// const TdIndex = styled(TdNumeric)` +// background-color: ${(props) => props.theme.gray700}; +// color: ${(props) => props.theme.gray100}; +// ` +// +// const TrSpacer = styled.tr` +// height: 2px; +// +// & td { +// background-color: ${(props) => props.theme.gray400}; +// } +// ` diff --git a/packages_rs/nextclade-web/src/components/Citation/CitationButton.tsx b/packages_rs/nextclade-web/src/components/Citation/CitationButton.tsx index e8083cc76..96a586f51 100644 --- a/packages_rs/nextclade-web/src/components/Citation/CitationButton.tsx +++ b/packages_rs/nextclade-web/src/components/Citation/CitationButton.tsx @@ -1,5 +1,4 @@ import React, { useCallback, useState } from 'react' - import { useTranslationSafe as useTranslation } from 'src/helpers/useTranslationSafe' import { Button, @@ -13,19 +12,33 @@ import { Row, } from 'reactstrap' import styled from 'styled-components' -import { HiOutlineAcademicCap } from 'react-icons/hi' - -import { ButtonTransparent } from 'src/components/Common/ButtonTransparent' import { Citation } from 'src/components/Citation/Citation' -export const ButtonCitationBase = styled(ButtonTransparent)` - margin: 2px 2px; - height: 38px; - width: 50px; - color: ${(props) => props.theme.gray700}; - - @media (min-width: 1200px) { - width: 100px; +export const ButtonCitationBase = styled(Button)` + color: ${(props) => props.theme.bodyColor}; + padding: 0; + background-color: transparent; + background-image: none; + border: none; + 
border-radius: 0; + box-shadow: none; + border-image: none; + text-decoration: none; + -webkit-tap-highlight-color: #ccc; + + & .active, + & :active, + & :hover, + & :target, + & :focus, + & :focus-visible, + & :focus-within { + background-color: transparent; + background-image: none; + border: none; + border-radius: 0; + box-shadow: none; + border-image: none; } ` @@ -100,8 +113,7 @@ export function CitationButton() { return ( <> - - + {text} diff --git a/packages_rs/nextclade-web/src/components/Common/List.tsx b/packages_rs/nextclade-web/src/components/Common/List.tsx index e8496a60f..0ec0fdc27 100644 --- a/packages_rs/nextclade-web/src/components/Common/List.tsx +++ b/packages_rs/nextclade-web/src/components/Common/List.tsx @@ -1,4 +1,4 @@ -import styled from 'styled-components' +import styled, { css } from 'styled-components' export const Ul = styled.ul` padding-left: 1.5rem; @@ -13,3 +13,35 @@ export const UlInvisible = styled.ul` export const LiInvisible = styled.li` list-style: none; ` + +// @formatter:off +// prettier-ignore +export const ScrollShadowVerticalCss = css` + /** Taken from: https://css-tricks.com/books/greatest-css-tricks/scroll-shadows */ + background: + /* Shadow Cover TOP */ linear-gradient(white 30%, rgba(255, 255, 255, 0)) center top, + /* Shadow Cover BOTTOM */ linear-gradient(rgba(255, 255, 255, 0), white 70%) center bottom, + /* Shadow TOP */ radial-gradient(farthest-side at 50% 0, rgba(0, 0, 0, 0.2), rgba(0, 0, 0, 0)) center top, + /* Shadow BOTTOM */ radial-gradient(farthest-side at 50% 100%, rgba(0, 0, 0, 0.2), rgba(0, 0, 0, 0)) center bottom; + background-repeat: no-repeat; + background-size: 100% 40px, 100% 40px, 100% 14px, 100% 14px; + background-attachment: local, local, scroll, scroll; +` +// @formatter:on + +export const ListGenericCss = css` + ${ScrollShadowVerticalCss}; + list-style: none; + padding: 0; + margin: 0; + -webkit-overflow-scrolling: touch; + overflow-scrolling: touch; + + & li { + border: 0; + } +` + +export 
const UlGeneric = styled.ul` + ${ListGenericCss} +` diff --git a/packages_rs/nextclade-web/src/components/Common/Markdown.tsx b/packages_rs/nextclade-web/src/components/Common/Markdown.tsx new file mode 100644 index 000000000..e9c0e22d1 --- /dev/null +++ b/packages_rs/nextclade-web/src/components/Common/Markdown.tsx @@ -0,0 +1,45 @@ +import React, { Suspense } from 'react' +import ReactMarkdown from 'react-markdown' +import remarkGfm from 'remark-gfm' +import rehypeRaw from 'rehype-raw' +import rehypeSanitize from 'rehype-sanitize' +import { LinkExternal } from 'src/components/Link/LinkExternal' +import { useAxiosQuery } from 'src/helpers/useAxiosQuery' +import { LOADING } from 'src/components/Loading/Loading' + +const REMARK_PLUGINS = [remarkGfm] + +const REHYPE_PLUGINS = [rehypeRaw, rehypeSanitize] + +const MD_COMPONENTS = { + a: LinkExternal, +} + +export interface MarkdownProps { + content: string +} + +export function Markdown({ content }: MarkdownProps) { + return ( + + {content} + + ) +} + +export interface MarkdownRemoteProps { + url: string +} + +function MarkdownRemoteImpl({ url }: MarkdownRemoteProps) { + const content = useAxiosQuery(url) + return +} + +export function MarkdownRemote({ url }: MarkdownRemoteProps) { + return ( + + + + ) +} diff --git a/packages_rs/nextclade-web/src/components/Common/SearchBox.tsx b/packages_rs/nextclade-web/src/components/Common/SearchBox.tsx new file mode 100644 index 000000000..c5b634d7a --- /dev/null +++ b/packages_rs/nextclade-web/src/components/Common/SearchBox.tsx @@ -0,0 +1,96 @@ +import React, { ChangeEvent, useCallback, useMemo, HTMLProps } from 'react' +import styled from 'styled-components' +import { Form, Input as InputBase } from 'reactstrap' +import { MdSearch as IconSearchBase, MdClear as IconClearBase } from 'react-icons/md' +import { useTranslationSafe } from 'src/helpers/useTranslationSafe' +import { ButtonTransparent } from 'src/components/Common/ButtonTransparent' + +const SearchForm = styled(Form)` + 
display: inline; + position: relative; +` + +const IconSearchWrapper = styled.span` + display: inline; + position: absolute; + padding: 5px 7px; +` + +const IconSearch = styled(IconSearchBase)` + * { + color: ${(props) => props.theme.gray500}; + } +` + +const ButtonClear = styled(ButtonTransparent)` + display: inline; + position: absolute; + right: 0; + padding: 0 7px; +` + +const IconClear = styled(IconClearBase)` + * { + color: ${(props) => props.theme.gray500}; + } +` + +const Input = styled(InputBase)` + display: inline !important; + padding-left: 35px; + padding-right: 30px; + height: 2.2em; +` + +export interface SearchBoxProps extends Omit, 'as'> { + searchTitle?: string + searchTerm: string + onSearchTermChange(term: string): void +} + +export function SearchBox({ searchTitle, searchTerm, onSearchTermChange, ...restProps }: SearchBoxProps) { + const { t } = useTranslationSafe() + + const onChange = useCallback( + (event: ChangeEvent) => { + onSearchTermChange(event.target.value) + }, + [onSearchTermChange], + ) + + const onClear = useCallback(() => { + onSearchTermChange('') + }, [onSearchTermChange]) + + const buttonClear = useMemo(() => { + if (searchTerm.length === 0) { + return null + } + return ( + + + + ) + }, [onClear, searchTerm.length, t]) + + return ( + + + + + + {buttonClear} + + ) +} diff --git a/packages_rs/nextclade-web/src/components/Common/Toggle.tsx b/packages_rs/nextclade-web/src/components/Common/Toggle.tsx index ab6afbcb6..5f15d029a 100644 --- a/packages_rs/nextclade-web/src/components/Common/Toggle.tsx +++ b/packages_rs/nextclade-web/src/components/Common/Toggle.tsx @@ -6,6 +6,8 @@ import ReactToggle, { ToggleProps as ReactToggleProps } from 'react-toggle' import 'react-toggle/style.css' export const ToggleBase = styled(ReactToggle)` + display: block; + &.react-toggle-custom { & > .react-toggle-track { background-color: #9c3434; diff --git a/packages_rs/nextclade-web/src/components/FilePicker/FilePicker.tsx 
b/packages_rs/nextclade-web/src/components/FilePicker/FilePicker.tsx index 30f2e95b9..87aaf3d68 100644 --- a/packages_rs/nextclade-web/src/components/FilePicker/FilePicker.tsx +++ b/packages_rs/nextclade-web/src/components/FilePicker/FilePicker.tsx @@ -16,6 +16,7 @@ import { UploadedFileInfo } from './UploadedFileInfo' import { UploadedFileInfoCompact } from './UploadedFileInfoCompact' export const FilePickerContainer = styled.div` + flex: 1; display: flex; flex-direction: column; ` @@ -38,7 +39,11 @@ export const FilePickerTitle = styled.h4` margin: auto 0; ` -export const TabsPanelStyled = styled(TabsPanel)`` +export const TabsPanelStyled = styled(TabsPanel)` + * { + background: transparent !important; + } +` const TabsContentStyled = styled(TabsContent)` height: 100%; @@ -106,12 +111,12 @@ export function FilePicker({ const onPaste = useCallback( (content: string) => { if (multiple) { - onInputs?.([new AlgorithmInputString(content)]) + onInputs?.([new AlgorithmInputString(content, t('Pasted sequences'))]) } else { - onInput?.(new AlgorithmInputString(content)) + onInput?.(new AlgorithmInputString(content, t('Pasted sequences'))) } }, - [multiple, onInput, onInputs], + [multiple, onInput, onInputs, t], ) // eslint-disable-next-line no-void diff --git a/packages_rs/nextclade-web/src/components/FilePicker/FilePickerAdvanced.tsx b/packages_rs/nextclade-web/src/components/FilePicker/FilePickerAdvanced.tsx index d77cae09a..0eac95b9f 100644 --- a/packages_rs/nextclade-web/src/components/FilePicker/FilePickerAdvanced.tsx +++ b/packages_rs/nextclade-web/src/components/FilePicker/FilePickerAdvanced.tsx @@ -4,20 +4,8 @@ import { useTranslationSafe as useTranslation } from 'src/helpers/useTranslation import { Col, Row } from 'reactstrap' import { useRecoilState, useRecoilValue, useResetRecoilState } from 'recoil' -import { - geneMapErrorAtom, - qcConfigErrorAtom, - refSeqErrorAtom, - refTreeErrorAtom, - virusPropertiesErrorAtom, -} from 'src/state/error.state' -import { - 
geneMapInputAtom, - qcConfigInputAtom, - refSeqInputAtom, - refTreeInputAtom, - virusPropertiesInputAtom, -} from 'src/state/inputs.state' +import { geneMapErrorAtom, refSeqErrorAtom, refTreeErrorAtom, virusPropertiesErrorAtom } from 'src/state/error.state' +import { geneMapInputAtom, refSeqInputAtom, refTreeInputAtom, virusPropertiesInputAtom } from 'src/state/inputs.state' import { FileIconFasta, FileIconGff, FileIconJson } from 'src/components/Common/FileIcons' import { FilePicker } from 'src/components/FilePicker/FilePicker' @@ -37,10 +25,6 @@ export function FilePickerAdvanced() { const refTreeError = useRecoilValue(refTreeErrorAtom) const resetRefTree = useResetRecoilState(refTreeInputAtom) - const [qcConfig, setQcConfig] = useRecoilState(qcConfigInputAtom) - const qcConfigError = useRecoilValue(qcConfigErrorAtom) - const resetQcConfig = useResetRecoilState(qcConfigInputAtom) - const [virusProperties, setVirusProperties] = useRecoilState(virusPropertiesInputAtom) const virusPropertiesError = useRecoilValue(virusPropertiesErrorAtom) const resetVirusProperties = useResetRecoilState(virusPropertiesInputAtom) @@ -78,19 +62,6 @@ export function FilePickerAdvanced() { onInput={setRefSeq} /> - - ( - {t('Drag & drop files')} + {t('Drag & drop files or folders')} {t('Select files')} ), diff --git a/packages_rs/nextclade-web/src/components/Layout/Footer.tsx b/packages_rs/nextclade-web/src/components/Layout/Footer.tsx index b7b2bdfd6..42dbdc094 100644 --- a/packages_rs/nextclade-web/src/components/Layout/Footer.tsx +++ b/packages_rs/nextclade-web/src/components/Layout/Footer.tsx @@ -1,29 +1,27 @@ import React from 'react' - import { useTranslationSafe as useTranslation } from 'src/helpers/useTranslationSafe' -import { Col, Container, Row } from 'reactstrap' +import { Col, Row } from 'reactstrap' import styled from 'styled-components' - import { PROJECT_NAME, COMPANY_NAME } from 'src/constants' import { getCopyrightYearRange } from 'src/helpers/getCopyrightYearRange' 
import { LinkExternal } from 'src/components/Link/LinkExternal' import { getVersionString } from 'src/helpers/getVersionString' - import LogoBedfordlab from 'src/assets/img/bedfordlab.svg' import LogoBiozentrum from 'src/assets/img/biozentrum_square.svg' import LogoSib from 'src/assets/img/sib.logo.svg' import LogoFredHutch from 'src/assets/img/fred_hutch.svg' import LogoNeherlab from 'src/assets/img/neherlab.svg' -// impoas from from 'src/assets/img/nextstrain_logo.svg' -// impoas from from 'src/assets/img/unibas.svg' import LogoVercel from 'src/assets/img/powered-by-vercel.svg' -const FooterContainer = styled(Container)` - background-color: #2a2a2a; - color: #c4cdd5; +const Container = styled.footer` + height: 38px; + width: 100%; + bottom: 0; padding: 6px 10px; - border-top-left-radius: 3px; - border-top-right-radius: 3px; + box-shadow: ${(props) => props.theme.shadows.large}; + z-index: 1000; + background-color: ${(props) => props.theme.white}; + opacity: 1; ` const CopyrightText = styled.div` @@ -76,12 +74,12 @@ const VersionText = styled.div` } ` -export default function Footer() { +export function Footer() { const { t } = useTranslation() const copyrightYearRange = getCopyrightYearRange() return ( - + @@ -121,6 +119,6 @@ export default function Footer() { {getVersionString()} - + ) } diff --git a/packages_rs/nextclade-web/src/components/Layout/LanguageSwitcher.tsx b/packages_rs/nextclade-web/src/components/Layout/LanguageSwitcher.tsx index d0455bf72..a788f45ec 100644 --- a/packages_rs/nextclade-web/src/components/Layout/LanguageSwitcher.tsx +++ b/packages_rs/nextclade-web/src/components/Layout/LanguageSwitcher.tsx @@ -1,7 +1,13 @@ import React, { useCallback, useMemo, useState } from 'react' -import { Dropdown, DropdownToggle, DropdownMenu, DropdownItem, DropdownProps } from 'reactstrap' +import { + Dropdown as DropdownBase, + DropdownToggle as DropdownToggleBase, + DropdownMenu as DropdownMenuBase, + DropdownItem, + DropdownProps, +} from 'reactstrap' 
import { useRecoilState } from 'recoil' - +import styled from 'styled-components' import { localeAtom } from 'src/state/locale.state' import { getLocaleWithKey, Locale, localesArray } from 'src/i18n/i18n' @@ -14,11 +20,11 @@ export function LanguageSwitcher({ ...restProps }: LanguageSwitcherProps) { const setLocaleLocal = useCallback((locale: Locale) => () => setCurrentLocale(locale.key), [setCurrentLocale]) return ( - + - + - + {localesArray.map((locale) => { const isCurrent = locale.key === currentLocale return ( @@ -33,20 +39,42 @@ export function LanguageSwitcher({ ...restProps }: LanguageSwitcherProps) { } export function LanguageSwitcherItem({ locale }: { locale: string }) { - const { Flag, name, native } = getLocaleWithKey(locale) - - const label = useMemo(() => { - if (name === native) { - return name - } - - return `${native} (${name})` + const { name, native } = getLocaleWithKey(locale) + const { label, tooltip } = useMemo(() => { + return { label: `(${native})`, tooltip: `${name} (${native})` } }, [name, native]) - return ( - <> - - {label} - + + + {label} + ) } + +export function LabelShort({ locale, ...restProps }: { locale: string; className?: string }) { + const { key } = getLocaleWithKey(locale) + return {key} +} + +const LabelShortText = styled.span` + font-family: ${(props) => props.theme.font.monospace}; + text-transform: uppercase !important; + color: unset !important; +` + +const Dropdown = styled(DropdownBase)` + padding: 0; + margin: 0; +` + +const DropdownToggle = styled(DropdownToggleBase)` + color: ${(props) => props.theme.bodyColor}; + padding: 0; + margin: 0; +` + +const DropdownMenu = styled(DropdownMenuBase)` + background-color: ${(props) => props.theme.bodyBg}; + box-shadow: 1px 1px 20px 0 #0005; + transition: opacity ease-out 0.25s; +` diff --git a/packages_rs/nextclade-web/src/components/Layout/Layout.tsx b/packages_rs/nextclade-web/src/components/Layout/Layout.tsx new file mode 100644 index 000000000..ef0d56bb6 --- /dev/null +++ 
b/packages_rs/nextclade-web/src/components/Layout/Layout.tsx @@ -0,0 +1,49 @@ +import React, { PropsWithChildren, HTMLProps } from 'react' +import styled from 'styled-components' + +import { NavigationBar } from './NavigationBar' +import { Footer } from './Footer' +import { UpdateNotification } from './UpdateNotification' + +const Container = styled.div` + display: flex; + flex-direction: column; + height: 100%; + width: 100%; + padding: 0; + margin: 0; +` + +const HeaderWrapper = styled.header` + height: 45px; +` + +const MainWrapper = styled.main` + flex: auto; + overflow: hidden; + height: 100%; + width: 100%; + padding: 0; + margin: 0; +` + +const FooterWrapper = styled.footer`` + +export function Layout({ children }: PropsWithChildren>) { + return ( + + + + + + + + {children} + + + +