From 7810d38070a177f9da351ac63edc96a0bb2200dc Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 17 May 2024 14:08:41 +0100 Subject: [PATCH 01/13] in progress --- src/directsketch.rs | 282 +++++++++++++++++- src/lib.rs | 35 ++- .../sourmash_plugin_directsketch/__init__.py | 71 ++++- src/utils.rs | 28 +- 4 files changed, 402 insertions(+), 14 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index 25ca6f3..23eaf53 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -23,7 +23,8 @@ use sourmash::manifest::{Manifest, Record}; use sourmash::signature::Signature; use crate::utils::{ - build_siginfo, load_gbassembly_info, parse_params_str, GBAssemblyData, GenBankFileType, + build_siginfo, load_gbassembly_info, parse_params_str, AccessionData, GBAssemblyData, + GenBankFileType, }; use reqwest::Url; @@ -246,7 +247,7 @@ pub struct FailedDownload { } #[allow(clippy::too_many_arguments)] -async fn dl_sketch_accession( +async fn dl_sketch_assembly_accession( client: &Client, accinfo: GBAssemblyData, location: &PathBuf, @@ -372,6 +373,105 @@ async fn dl_sketch_accession( Ok((sigs, failed)) } +#[allow(clippy::too_many_arguments)] +async fn dl_sketch_url( + client: &Client, + accinfo: AccessionData, + location: &PathBuf, + retry: Option, + keep_fastas: bool, + dna_sigs: Vec, + prot_sigs: Vec, + genomes_only: bool, + proteomes_only: bool, + download_only: bool, +) -> Result<(Vec, Vec)> { + let retry_count = retry.unwrap_or(3); // Default retry count + let mut sigs = Vec::::new(); + let mut failed = Vec::::new(); + + let name = accinfo.name; + let accession = accinfo.accession; + let url = accinfo.url; + let expected_md5 = accinfo.expected_md5sum; + let download_filename = accinfo.download_filename; + let input_moltype = accinfo.input_moltype; + + // let mut file_types = vec![ + // GenBankFileType::Genomic, + // GenBankFileType::Protein, + // // GenBankFileType::AssemblyReport, + // ]; + // if genomes_only { + // file_types = vec![GenBankFileType::Genomic]; + // } else if proteomes_only { + // file_types = vec![GenBankFileType::Protein]; + // } + + let data = match download_with_retry( + client, + &url, + expected_md5.map(|x| x.as_str()), + retry_count, + ) + .await + { + Ok(data) => data, + Err(_err) => { + // here --> keep track of accession errors + filetype + let failed_download = FailedDownload { + accession: accession.clone(), + name: accinfo.name.clone(), + url: Some(accinfo.url), + moltype: input_moltype, + }; + failed.push(failed_download); + } + }; + + for file_type in &file_types { + let url = file_type.url(&base_url, &full_name); + let expected_md5 = checksums.get(&file_type.server_filename(&full_name)); + + let file_name = file_type.filename_to_write(&accession); + + if keep_fastas { + let path = location.join(&file_name); + fs::write(&path, &data).context("Failed to write data to file")?; + } + if !download_only { + // sketch data + match file_type { + GenBankFileType::Genomic => sigs.extend( + sketch_data( + name.clone(), + file_name.clone(), + data, + dna_sigs.clone(), + "dna".to_string(), + ) + .await?, + ), + GenBankFileType::Protein => { + sigs.extend( + sketch_data( + name.clone(), + file_name.clone(), + data, + prot_sigs.clone(), + "protein".to_string(), + ) + .await?, + ); + } + _ => {} // Do nothing for other file types + }; + } + } + + Ok((sigs, failed)) +} + async fn write_sig( sig: &Signature, md5sum_occurrences: &mut HashMap, @@ -568,7 +668,7 @@ pub fn error_handler( #[tokio::main] #[allow(clippy::too_many_arguments)] -pub async fn download_and_sketch( +pub async fn gbsketch( py: Python, input_csv: String, param_str: String, @@ -688,7 +788,181 @@ pub async fn download_and_sketch( ); } // Perform download and sketch - let result = dl_sketch_accession( + let result = dl_sketch_assembly_accession( + &client_clone, + accinfo.clone(), + &download_path_clone, + Some(retry_times), + keep_fastas, + dna_sigs, + prot_sigs, + genomes_only, + proteomes_only, + download_only, + ) + .await; + match result { + Ok((sigs, failed_downloads)) => { + if !sigs.is_empty() { + if let Err(e) = send_sigs.send(sigs).await { + eprintln!("Failed to send signatures: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel + } + } + for fail in failed_downloads { + if let Err(e) = send_failed.send(fail).await { + eprintln!("Failed to send failed download info: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel + } + } + } + Err(e) => { + let _ = send_errors.send(e).await; + } + } + drop(send_errors); + }); + } + // drop senders as we're done sending data + drop(send_sigs); + drop(send_failed); + drop(error_sender); + // Wait for all tasks to complete + for handle in handles { + if let Err(e) = handle.await { + eprintln!("Handle join error: {}.", e); + } + } + // since the only critical error is not having written any sigs + // check this here at end. Bail if we wrote expected sigs but wrote none. + if critical_error_flag.load(Ordering::SeqCst) & !download_only { + bail!("No signatures written, exiting."); + } + + Ok(()) +} + +#[tokio::main] +#[allow(clippy::too_many_arguments)] +pub async fn urlsketch( + py: Python, + input_csv: String, + param_str: String, + failed_csv: String, + retry_times: u32, + fasta_location: String, + keep_fastas: bool, + download_only: bool, + output_sigs: Option, +) -> Result<(), anyhow::Error> { + // if sig output provided but doesn't end in zip, bail + if let Some(ref output_sigs) = output_sigs { + if Path::new(&output_sigs) + .extension() + .map_or(true, |ext| ext != "zip") + { + bail!("Output must be a zip file."); + } + } + // set up fasta download path + let download_path = PathBuf::from(fasta_location); + if !download_path.exists() { + create_dir_all(&download_path)?; + } + + // create channels. buffer size here is 4 b/c we can do 3 downloads simultaneously + let (send_sigs, recv_sigs) = tokio::sync::mpsc::channel::>(4); + let (send_failed, recv_failed) = tokio::sync::mpsc::channel::(4); + // Error channel for handling task errors + let (error_sender, error_receiver) = tokio::sync::mpsc::channel::(1); + + // Set up collector/writing tasks + let mut handles = Vec::new(); + let sig_handle = sigwriter_handle(recv_sigs, output_sigs, error_sender.clone()); + let failures_handle = failures_handle(failed_csv, recv_failed, error_sender.clone()); + let critical_error_flag = Arc::new(AtomicBool::new(false)); + let error_handle = error_handler(error_receiver, critical_error_flag.clone()); + handles.push(sig_handle); + handles.push(failures_handle); + handles.push(error_handle); + + // Worker tasks + let semaphore = Arc::new(Semaphore::new(3)); // Limiting concurrent downloads + let client = Arc::new(Client::new()); + + // Open the file containing the accessions synchronously + let (accession_info, n_accs) = load_gbassembly_info(input_csv)?; + if n_accs == 0 { + bail!("No accessions to download and sketch.") + } + + // parse param string into params_vec, print error if fail + let param_result = parse_params_str(param_str); + let params_vec = match param_result { + Ok(params) => params, + Err(e) => { + bail!("Failed to parse params string: {}", e); + } + }; + let dna_sig_templates = build_siginfo(¶ms_vec, "DNA"); + let prot_sig_templates = build_siginfo(¶ms_vec, "protein"); + + let mut genomes_only = false; + let mut proteomes_only = false; + + // Check if dna_sig_templates is empty and not keep_fastas + if dna_sig_templates.is_empty() && !keep_fastas { + eprintln!("No DNA signature templates provided, and --keep-fastas is not set."); + proteomes_only = true; + } + // Check if protein_sig_templates is empty and not keep_fastas + if prot_sig_templates.is_empty() && !keep_fastas { + eprintln!("No protein signature templates provided, and --keep-fastas is not set."); + genomes_only = true; + } + if genomes_only { + if !download_only { + eprintln!("Downloading and sketching genomes only."); + } else { + eprintln!("Downloading genomes only."); + } + } else if proteomes_only { + if !download_only { + eprintln!("Downloading and sketching proteomes only."); + } else { + eprintln!("Downloading proteomes only."); + } + } + + // report every 1 percent (or every 1, whichever is larger) + let reporting_threshold = std::cmp::max(n_accs / 100, 1); + + for (i, accinfo) in accession_info.into_iter().enumerate() { + py.check_signals()?; // If interrupted, return an Err automatically + let semaphore_clone = Arc::clone(&semaphore); + let client_clone = Arc::clone(&client); + let send_sigs = send_sigs.clone(); + let send_failed = send_failed.clone(); + let download_path_clone = download_path.clone(); // Clone the path for each task + let send_errors = error_sender.clone(); + + let dna_sigs = dna_sig_templates.clone(); + let prot_sigs = prot_sig_templates.clone(); + + tokio::spawn(async move { + let _permit = semaphore_clone.acquire().await; + // progress report when the permit is available and processing begins + if (i + 1) % reporting_threshold == 0 { + let percent_processed = (((i + 1) as f64 / n_accs as f64) * 100.0).round(); + println!( + "Starting accession {}/{} ({}%)", + (i + 1), + n_accs, + percent_processed + ); + } + // Perform download and sketch + let result = dl_sketch_url( &client_clone, accinfo.clone(), &download_path_clone, diff --git a/src/lib.rs b/src/lib.rs index 7d7b7e4..88e77ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -62,7 +62,7 @@ fn do_gbsketch( download_only: bool, output_sigs: Option, ) -> anyhow::Result { - match directsketch::download_and_sketch( + match directsketch::gbsketch( py, input_csv, param_str, @@ -83,9 +83,42 @@ fn do_gbsketch( } } +#[pyfunction] +#[allow(clippy::too_many_arguments)] +fn do_urlsketch( + py: Python, + input_csv: String, + param_str: String, + failed_csv: String, + retry_times: u32, + fasta_location: String, + keep_fastas: bool, + download_only: bool, + output_sigs: Option, +) -> anyhow::Result { + match directsketch::urlsketch( + py, + input_csv, + param_str, + failed_csv, + retry_times, + fasta_location, + keep_fastas, + download_only, + output_sigs, + ) { + Ok(_) => Ok(0), + Err(e) => { + eprintln!("Error: {e}"); + Ok(1) + } + } +} + #[pymodule] fn sourmash_plugin_directsketch(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(do_gbsketch, m)?)?; + m.add_function(wrap_pyfunction!(do_urlsketch, m)?)?; m.add_function(wrap_pyfunction!(set_tokio_thread_pool, m)?)?; Ok(()) } diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index bdeade4..beacbe3 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -43,7 +43,7 @@ def __init__(self, p): help='output zip file for the signatures') p.add_argument('-f', '--fastas', help='Write fastas here', default = '.') - p.add_argument('-k', '--keep-fastas', action='store_true', + p.add_argument('-k', '--keep-fasta', action='store_true', help="write FASTA files in addition to sketching. Default: do not write FASTA files") p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') p.add_argument('--failed',help='csv of failed accessions and download links (should be mostly protein).') @@ -86,7 +86,74 @@ def main(self, args): args.failed, args.retry_times, args.fastas, - args.keep_fastas, + args.keep_fasta, + args.genomes_only, + args.proteomes_only, + args.download_only, + args.output) + + if status == 0: + notify("...gbsketch is done!") + if args.output is not None: + notify(f"Sigs in '{args.output}'.") + if args.keep_fastas: + notify(f"FASTAs in '{args.fastas}'.") + + return status + + +class Download_and_Sketch_Url(CommandLinePlugin): + command = 'urlsketch' + description = 'download and sketch GenBank assembly datasets' + + def __init__(self, p): + super().__init__(p) + p.add_argument('input_csv', help="a txt file or csv file containing accessions in the first column") + p.add_argument('-o', '--output', default=None, + help='output zip file for the signatures') + p.add_argument('-f', '--fastas', + help='Write fastas here', default = '.') + p.add_argument('-k', '--keep-fasta', '--keep-fastq', action='store_true', + help="write FASTA/Q files in addition to sketching. Default: do not write FASTA files") + p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') + p.add_argument('--failed',help='csv of failed accessions and download links (should be mostly protein).') + p.add_argument('-p', '--param-string', action='append', type=str, default=[], + help='parameter string for sketching (default: k=31,scaled=1000)') + p.add_argument('-c', '--cores', default=0, type=int, + help='number of cores to use (default is all available)') + p.add_argument('-r', '--retry-times', default=1, type=int, + help='number of times to retry failed downloads') + + + def main(self, args): + print_version() + if not args.param_string: + args.param_string = ["k=31,scaled=1000"] + notify(f"params: {args.param_string}") + + if args.download_only and not args.keep_fastas: + notify("Error: '--download-only' requires '--keep-fasta'.") + sys.exit(-1) + if args.output is None and not args.download_only: + notify("Error: output signature zipfile is required if not using '--download-only'.") + sys.exit(-1) + + # convert to a single string for easier rust handling + args.param_string = "_".join(args.param_string) + # lowercase the param string + args.param_string = args.param_string.lower() + + num_threads = set_thread_pool(args.cores) + + notify(f"downloading and sketching all accessions in '{args.input_csv} using {num_threads} threads") + + super().main(args) + status = sourmash_plugin_directsketch.do_gbsketch(args.input_csv, + args.param_string, + args.failed, + args.retry_times, + args.fastas, + args.keep_fasta, args.genomes_only, args.proteomes_only, args.download_only, diff --git a/src/utils.rs b/src/utils.rs index 63dc9b7..32a14ed 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -79,7 +79,8 @@ pub struct AccessionData { pub name: String, pub input_moltype: InputMolType, pub url: reqwest::Url, - pub md5sum: Option, + pub expected_md5sum: Option, + pub download_filename: Option, // need to require this if --keep-fastas are used } #[derive(Clone)] @@ -161,7 +162,10 @@ pub fn load_gbassembly_info(input_csv: String) -> Result<(Vec, u } #[allow(dead_code)] -pub fn load_accession_info(input_csv: String) -> Result<(Vec, usize)> { +pub fn load_accession_info( + input_csv: String, + keep_fasta: bool, +) -> Result<(Vec, usize)> { let mut results = Vec::new(); let mut row_count = 0; let mut processed_rows = std::collections::HashSet::new(); @@ -218,19 +222,29 @@ pub fn load_accession_info(input_csv: String) -> Result<(Vec, usi }) .next() .ok_or_else(|| anyhow!("Invalid 'url' value"))?; - let md5sum = record.get(4).map(|s| s.to_string()); + let expected_md5sum = record.get(4).map(|s| s.to_string()); + let mut download_filename = None; + if keep_fasta { + download_filename = Some( + record + .get(5) + .ok_or_else(|| anyhow!("Missing 'download_filename' field"))? + .to_string(), + ); + } // count entries with url and md5sum - if md5sum.is_some() { + if expected_md5sum.is_some() { md5sum_count += 1; } // store accession data results.push(AccessionData { - accession: acc.to_string(), - name: name.to_string(), + accession: acc, + name: name, input_moltype, url, - md5sum, + expected_md5sum, + download_filename, }); } From 5b298305487a21fe751ebf3dc676001cf220c80c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sun, 19 May 2024 08:53:41 +0100 Subject: [PATCH 02/13] compiling version --- src/directsketch.rs | 104 +++++++++++++++++++------------------------- src/utils.rs | 9 ++++ 2 files changed, 53 insertions(+), 60 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index 23eaf53..2cb4820 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -23,8 +23,8 @@ use sourmash::manifest::{Manifest, Record}; use sourmash::signature::Signature; use crate::utils::{ - build_siginfo, load_gbassembly_info, parse_params_str, AccessionData, GBAssemblyData, - GenBankFileType, + build_siginfo, load_accession_info, load_gbassembly_info, parse_params_str, AccessionData, + GBAssemblyData, GenBankFileType, InputMolType, }; use reqwest::Url; @@ -397,75 +397,59 @@ async fn dl_sketch_url( let download_filename = accinfo.download_filename; let input_moltype = accinfo.input_moltype; - // let mut file_types = vec![ - // GenBankFileType::Genomic, - // GenBankFileType::Protein, - // // GenBankFileType::AssemblyReport, - // ]; - // if genomes_only { - // file_types = vec![GenBankFileType::Genomic]; - // } else if proteomes_only { - // file_types = vec![GenBankFileType::Protein]; - // } - - let data = match download_with_retry( + match download_with_retry( client, &url, - expected_md5.map(|x| x.as_str()), + expected_md5.as_ref().map(|x| x.as_str()), retry_count, ) .await + .ok() { - Ok(data) => data, - Err(_err) => { - // here --> keep track of accession errors + filetype - let failed_download = FailedDownload { - accession: accession.clone(), - name: accinfo.name.clone(), - url: Some(accinfo.url), - moltype: input_moltype, - }; - failed.push(failed_download); - } - }; - - for file_type in &file_types { - let url = file_type.url(&base_url, &full_name); - let expected_md5 = checksums.get(&file_type.server_filename(&full_name)); - - let file_name = file_type.filename_to_write(&accession); - - if keep_fastas { - let path = location.join(&file_name); - fs::write(&path, &data).context("Failed to write data to file")?; - } - if !download_only { - // sketch data - match file_type { - GenBankFileType::Genomic => sigs.extend( - sketch_data( - name.clone(), - file_name.clone(), - data, - dna_sigs.clone(), - "dna".to_string(), - ) - .await?, - ), - GenBankFileType::Protein => { - sigs.extend( + Some(data) => { + // check keep_fastas instead?? + if let Some(download_filename) = download_filename { + let path = location.join(&download_filename); + fs::write(&path, &data).context("Failed to write data to file")?; + } + if !download_only { + // let filename = download_filename.clone().unwrap(); + let filename = "".to_string(); + // sketch data + match input_moltype { + InputMolType::Dna => sigs.extend( sketch_data( name.clone(), - file_name.clone(), + filename.clone(), data, - prot_sigs.clone(), - "protein".to_string(), + dna_sigs.clone(), + "dna".to_string(), ) .await?, - ); - } - _ => {} // Do nothing for other file types + ), + InputMolType::Protein => { + sigs.extend( + sketch_data( + name.clone(), + filename.clone(), + data, + prot_sigs.clone(), + "protein".to_string(), + ) + .await?, + ); + } + }; + } + } + None => { + let failed_download = FailedDownload { + accession: accession.clone(), + name: name.clone(), + url: Some(url), + moltype: input_moltype.to_string(), }; + failed.push(failed_download); } } @@ -891,7 +875,7 @@ pub async fn urlsketch( let client = Arc::new(Client::new()); // Open the file containing the accessions synchronously - let (accession_info, n_accs) = load_gbassembly_info(input_csv)?; + let (accession_info, n_accs) = load_accession_info(input_csv, keep_fastas)?; if n_accs == 0 { bail!("No accessions to download and sketch.") } diff --git a/src/utils.rs b/src/utils.rs index 32a14ed..7a65cc3 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -11,6 +11,15 @@ pub enum InputMolType { Protein, } +impl InputMolType { + pub fn to_string(&self) -> String { + match self { + InputMolType::Dna => "dna".to_string(), + InputMolType::Protein => "protein".to_string(), + } + } +} + impl std::str::FromStr for InputMolType { type Err = (); From 847e436fe1d0eea082d28c11fccf4bbbb5af8054 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sun, 19 May 2024 09:02:19 +0100 Subject: [PATCH 03/13] init testing --- src/utils.rs | 32 ++++++++++++--------- tests/test-data/acc-url.csv | 4 +++ tests/test_urlsketch.py | 57 +++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 13 deletions(-) create mode 100644 tests/test-data/acc-url.csv create mode 100644 tests/test_urlsketch.py diff --git a/src/utils.rs b/src/utils.rs index 7a65cc3..48f6e50 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -185,7 +185,14 @@ pub fn load_accession_info( // Check column names let header = rdr.headers()?; - let expected_header = vec!["accession", "name", "input_moltype", "url", "md5sum"]; + let expected_header = vec![ + "accession", + "name", + "input_moltype", + "md5sum", + "download_filename", + "url", + ]; if header != expected_header { return Err(anyhow!( "Invalid column names in CSV file. Columns should be: {:?}", @@ -217,8 +224,18 @@ pub fn load_accession_info( .ok_or_else(|| anyhow!("Missing 'input_moltype' field"))? .parse::() .map_err(|_| anyhow!("Invalid 'input_moltype' value"))?; + let expected_md5sum = record.get(3).map(|s| s.to_string()); + let mut download_filename = None; + if keep_fasta { + download_filename = Some( + record + .get(4) + .ok_or_else(|| anyhow!("Missing 'download_filename' field"))? + .to_string(), + ); + } let url = record - .get(3) + .get(5) .ok_or_else(|| anyhow!("Missing 'url' field"))? .split(',') .filter_map(|s| { @@ -231,17 +248,6 @@ pub fn load_accession_info( }) .next() .ok_or_else(|| anyhow!("Invalid 'url' value"))?; - let expected_md5sum = record.get(4).map(|s| s.to_string()); - let mut download_filename = None; - if keep_fasta { - download_filename = Some( - record - .get(5) - .ok_or_else(|| anyhow!("Missing 'download_filename' field"))? - .to_string(), - ); - } - // count entries with url and md5sum if expected_md5sum.is_some() { md5sum_count += 1; diff --git a/tests/test-data/acc-url.csv b/tests/test-data/acc-url.csv new file mode 100644 index 0000000..692c21d --- /dev/null +++ b/tests/test-data/acc-url.csv @@ -0,0 +1,4 @@ +accession,name,input_moltype,md5sum,download_filename,url +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,, diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py new file mode 100644 index 0000000..e3849bb --- /dev/null +++ b/tests/test_urlsketch.py @@ -0,0 +1,57 @@ +""" +Tests for urlsketch +""" +import os +import pytest + +import sourmash +import sourmash_tst_utils as utils +from sourmash_tst_utils import SourmashCommandFailed + + + +def get_test_data(filename): + thisdir = os.path.dirname(__file__) + return os.path.join(thisdir, 'test-data', filename) + +def test_installed(runtmp): + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch') + + assert 'usage: urlsketch' in runtmp.last_result.err + + +def test_urlsketch_simple(runtmp): + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # why does this need ksize =30 and not ksize = 10!??? + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + if 'GCA_000175535.1' in sig.name: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + elif 'GCA_000961135.2' in sig.name: + assert sig.name == ss2.name + if sig.minhash.moltype == 'DNA': + assert sig.md5sum() == ss2.md5sum() + else: + assert sig.md5sum() == ss3.md5sum() From c2efab6f8c8e66d69fc2cad748b243f6ef2456dc Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sun, 19 May 2024 09:09:02 +0100 Subject: [PATCH 04/13] add entrypoint; working test --- pyproject.toml | 1 + src/python/sourmash_plugin_directsketch/__init__.py | 6 ++---- tests/test-data/acc-url.csv | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a1255d6..25ce6f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ build-backend = "maturin" [project.entry-points."sourmash.cli_script"] gbsketch = "sourmash_plugin_directsketch:Download_and_Sketch_Assemblies" +urlsketch = "sourmash_plugin_directsketch:Download_and_Sketch_Url" [project.optional-dependencies] test = [ diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index beacbe3..40a0a15 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -148,14 +148,12 @@ def main(self, args): notify(f"downloading and sketching all accessions in '{args.input_csv} using {num_threads} threads") super().main(args) - status = sourmash_plugin_directsketch.do_gbsketch(args.input_csv, + status = sourmash_plugin_directsketch.do_urlsketch(args.input_csv, args.param_string, args.failed, args.retry_times, args.fastas, args.keep_fasta, - args.genomes_only, - args.proteomes_only, args.download_only, args.output) @@ -163,7 +161,7 @@ def main(self, args): notify("...gbsketch is done!") if args.output is not None: notify(f"Sigs in '{args.output}'.") - if args.keep_fastas: + if args.keep_fasta: notify(f"FASTAs in '{args.fastas}'.") return status diff --git a/tests/test-data/acc-url.csv b/tests/test-data/acc-url.csv index 692c21d..dae1c74 100644 --- a/tests/test-data/acc-url.csv +++ b/tests/test-data/acc-url.csv @@ -1,4 +1,4 @@ accession,name,input_moltype,md5sum,download_filename,url -GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz -GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz -GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,, +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz From 12c1343c263ceda87a4bcf14580b0f6cb1c5c452 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sun, 19 May 2024 09:20:35 +0100 Subject: [PATCH 05/13] clippy fixes + arg compatibility --- src/directsketch.rs | 25 ++++++++----------- .../sourmash_plugin_directsketch/__init__.py | 6 ++--- src/utils.rs | 13 ++++++---- tests/test_gbsketch.py | 8 +++--- 4 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index 2cb4820..a5e513b 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -379,11 +379,11 @@ async fn dl_sketch_url( accinfo: AccessionData, location: &PathBuf, retry: Option, - keep_fastas: bool, + _keep_fastas: bool, dna_sigs: Vec, prot_sigs: Vec, - genomes_only: bool, - proteomes_only: bool, + _genomes_only: bool, + _proteomes_only: bool, download_only: bool, ) -> Result<(Vec, Vec)> { let retry_count = retry.unwrap_or(3); // Default retry count @@ -397,20 +397,15 @@ async fn dl_sketch_url( let download_filename = accinfo.download_filename; let input_moltype = accinfo.input_moltype; - match download_with_retry( - client, - &url, - expected_md5.as_ref().map(|x| x.as_str()), - retry_count, - ) - .await - .ok() + match download_with_retry(client, &url, expected_md5.as_deref(), retry_count) + .await + .ok() { Some(data) => { // check keep_fastas instead?? if let Some(download_filename) = download_filename { - let path = location.join(&download_filename); - fs::write(&path, &data).context("Failed to write data to file")?; + let path = location.join(download_filename); + fs::write(path, &data).context("Failed to write data to file")?; } if !download_only { // let filename = download_filename.clone().unwrap(); @@ -722,12 +717,12 @@ pub async fn gbsketch( // Check if dna_sig_templates is empty and not keep_fastas if dna_sig_templates.is_empty() && !keep_fastas { - eprintln!("No DNA signature templates provided, and --keep-fastas is not set."); + eprintln!("No DNA signature templates provided, and --keep-fasta is not set."); proteomes_only = true; } // Check if protein_sig_templates is empty and not keep_fastas if prot_sig_templates.is_empty() && !keep_fastas { - eprintln!("No protein signature templates provided, and --keep-fastas is not set."); + eprintln!("No protein signature templates provided, and --keep-fasta is not set."); genomes_only = true; } if genomes_only { diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index 40a0a15..33c29a5 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -64,8 +64,8 @@ def main(self, args): args.param_string = ["k=31,scaled=1000"] notify(f"params: {args.param_string}") - if args.download_only and not args.keep_fastas: - notify("Error: '--download-only' requires '--keep-fastas'.") + if args.download_only and not args.keep_fasta: + notify("Error: '--download-only' requires '--keep-fasta'.") sys.exit(-1) if args.output is None and not args.download_only: notify("Error: output signature zipfile is required if not using '--download-only'.") @@ -96,7 +96,7 @@ def main(self, args): notify("...gbsketch is done!") if args.output is not None: notify(f"Sigs in '{args.output}'.") - if args.keep_fastas: + if args.keep_fasta: notify(f"FASTAs in '{args.fastas}'.") return status diff --git a/src/utils.rs b/src/utils.rs index 48f6e50..226bae4 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -2,6 +2,7 @@ use anyhow::{anyhow, Result}; use reqwest::Url; use sourmash::cmd::ComputeParameters; use sourmash::signature::Signature; +use std::fmt; use std::hash::Hash; use std::hash::Hasher; @@ -11,11 +12,13 @@ pub enum InputMolType { Protein, } -impl InputMolType { - pub fn to_string(&self) -> String { +impl InputMolType {} + +impl fmt::Display for InputMolType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - InputMolType::Dna => "dna".to_string(), - InputMolType::Protein => "protein".to_string(), + InputMolType::Dna => write!(f, "dna"), + InputMolType::Protein => write!(f, "protein"), } } } @@ -255,7 +258,7 @@ pub fn load_accession_info( // store accession data results.push(AccessionData { accession: acc, - name: name, + name, input_moltype, url, expected_md5sum, diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index c3648b9..1301100 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -177,7 +177,7 @@ def test_gbsketch_genomes_only_via_params(runtmp, capfd): elif 'GCA_000961135.2' in sig.name: assert sig.name == ss2.name assert sig.md5sum() == ss2.md5sum() - assert 'No protein signature templates provided, and --keep-fastas is not set.' in captured.err + assert 'No protein signature templates provided, and --keep-fasta is not set.' in captured.err assert 'Downloading and sketching genomes only.' in captured.err @@ -206,7 +206,7 @@ def test_gbsketch_proteomes_only_via_params(runtmp, capfd): for sig in sigs: assert 'GCA_000961135.2' in sig.name assert sig.md5sum() == ss3.md5sum() - assert 'No DNA signature templates provided, and --keep-fastas is not set.' in captured.err + assert 'No DNA signature templates provided, and --keep-fasta is not set.' in captured.err assert 'Downloading and sketching proteomes only.' in captured.err @@ -226,7 +226,7 @@ def test_gbsketch_save_fastas(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fastas', + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -265,7 +265,7 @@ def test_gbsketch_download_only(runtmp, capfd): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '--download-only', - '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fastas', + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert not runtmp.last_result.out # stdout should be empty From e2c4acca57d1f39eb733ca835c063af1c4ce2dab Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 20 May 2024 08:59:52 +0100 Subject: [PATCH 06/13] update failure format --- src/directsketch.rs | 35 ++++++++++++++++++++++++++--------- tests/test_gbsketch.py | 9 ++++++--- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index 7d38cca..ee55968 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -242,8 +242,10 @@ async fn sketch_data( pub struct FailedDownload { accession: String, name: String, - url: Option, moltype: String, + md5sum: Option, + download_filename: Option, + url: Option, } #[allow(clippy::too_many_arguments)] @@ -276,8 +278,10 @@ async fn dl_sketch_assembly_accession( let failed_download_dna = FailedDownload { accession: accession.clone(), name: name.clone(), - url: None, moltype: "dna".to_string(), + md5sum: None, + download_filename: None, + url: None, }; failed.push(failed_download_dna); } @@ -285,8 +289,10 @@ async fn dl_sketch_assembly_accession( let failed_download_protein = FailedDownload { accession: accession.clone(), name: name.clone(), - url: None, moltype: "protein".to_string(), + md5sum: None, + download_filename: None, + url: None, }; failed.push(failed_download_protein); } @@ -317,6 +323,7 @@ async fn dl_sketch_assembly_accession( for file_type in &file_types { let url = file_type.url(&base_url, &full_name); let expected_md5 = checksums.get(&file_type.server_filename(&full_name)); + let file_name = file_type.filename_to_write(&accession); let data = match download_with_retry(client, &url, expected_md5.map(|x| x.as_str()), retry_count) .await @@ -327,14 +334,15 @@ async fn dl_sketch_assembly_accession( let failed_download = FailedDownload { accession: accession.clone(), name: name.clone(), - url: Some(url), moltype: file_type.moltype(), + md5sum: expected_md5.map(|x| x.to_string()), + download_filename: Some(file_name), + url: Some(url), }; failed.push(failed_download); continue; } }; - let file_name = file_type.filename_to_write(&accession); if keep_fastas { let path = location.join(&file_name); @@ -441,8 +449,10 @@ async fn dl_sketch_url( let failed_download = FailedDownload { accession: accession.clone(), name: name.clone(), - url: Some(url), moltype: input_moltype.to_string(), + md5sum: expected_md5.map(|x| x.to_string()), + download_filename: download_filename, + url: Some(url), }; failed.push(failed_download); } @@ -593,7 +603,10 @@ pub fn failures_handle( let mut writer = BufWriter::new(file); // Attempt to write CSV headers - if let Err(e) = writer.write_all(b"accession,name,moltype,url\n").await { + if let Err(e) = writer + .write_all(b"accession,name,moltype,md5sum,download_filename,url\n") + .await + { let error = Error::new(e).context("Failed to write headers"); let _ = error_sender.send(error).await; return; // Exit the task early after reporting the error @@ -602,15 +615,19 @@ pub fn failures_handle( while let Some(FailedDownload { accession, name, - moltype, + md5sum, + download_filename, url, + moltype, }) = recv_failed.recv().await { let record = format!( - "{},{},{},{:?}\n", + "{},{},{},{},{},{}\n", accession, name, moltype, + md5sum.unwrap_or("".to_string()), + download_filename.unwrap_or("".to_string()), url.map(|u| u.to_string()).unwrap_or("".to_string()) ); // Attempt to write each record diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index c285a75..4a5d04e 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -57,13 +57,16 @@ def test_gbsketch_simple(runtmp): assert sig.md5sum() == ss3.md5sum() assert os.path.exists(failed) with open(failed, 'r') as failF: - next(failF) # skip header line + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url" for line in failF: - acc, name, moltype, url = line.strip().split(',') + print(line) + acc, name, moltype, md5sum, download_filename, url = line.strip().split(',') assert acc == "GCA_000175535.1" assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" assert moltype == "protein" - assert url == '"https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"' + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" def test_gbsketch_simple_url(runtmp): From 66e959f65ad38ee7f29d6945f9368ec8d53abcbd Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 20 May 2024 09:07:00 +0100 Subject: [PATCH 07/13] unify failure output format, urlsketch input format --- src/directsketch.rs | 6 +++--- src/utils.rs | 12 ++++++------ tests/test-data/acc-url.csv | 2 +- tests/test_urlsketch.py | 12 ++++++++++++ 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index ee55968..90790b8 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -403,7 +403,7 @@ async fn dl_sketch_url( let url = accinfo.url; let expected_md5 = accinfo.expected_md5sum; let download_filename = accinfo.download_filename; - let input_moltype = accinfo.input_moltype; + let moltype = accinfo.moltype; match download_with_retry(client, &url, expected_md5.as_deref(), retry_count) .await @@ -419,7 +419,7 @@ async fn dl_sketch_url( // let filename = download_filename.clone().unwrap(); let filename = "".to_string(); // sketch data - match input_moltype { + match moltype { InputMolType::Dna => sigs.extend( sketch_data( name.clone(), @@ -449,7 +449,7 @@ async fn dl_sketch_url( let failed_download = FailedDownload { accession: accession.clone(), name: name.clone(), - moltype: input_moltype.to_string(), + moltype: moltype.to_string(), md5sum: expected_md5.map(|x| x.to_string()), download_filename: download_filename, url: Some(url), diff --git a/src/utils.rs b/src/utils.rs index 226bae4..d6618d5 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -89,7 +89,7 @@ impl GenBankFileType { pub struct AccessionData { pub accession: String, pub name: String, - pub input_moltype: InputMolType, + pub moltype: InputMolType, pub url: reqwest::Url, pub expected_md5sum: Option, pub download_filename: Option, // need to require this if --keep-fastas are used @@ -191,7 +191,7 @@ pub fn load_accession_info( let expected_header = vec![ "accession", "name", - "input_moltype", + "moltype", "md5sum", "download_filename", "url", @@ -222,11 +222,11 @@ pub fn load_accession_info( .get(1) .ok_or_else(|| anyhow!("Missing 'name' field"))? .to_string(); - let input_moltype = record + let moltype = record .get(2) - .ok_or_else(|| anyhow!("Missing 'input_moltype' field"))? + .ok_or_else(|| anyhow!("Missing 'moltype' field"))? .parse::() - .map_err(|_| anyhow!("Invalid 'input_moltype' value"))?; + .map_err(|_| anyhow!("Invalid 'moltype' value"))?; let expected_md5sum = record.get(3).map(|s| s.to_string()); let mut download_filename = None; if keep_fasta { @@ -259,7 +259,7 @@ pub fn load_accession_info( results.push(AccessionData { accession: acc, name, - input_moltype, + moltype, url, expected_md5sum, download_filename, diff --git a/tests/test-data/acc-url.csv b/tests/test-data/acc-url.csv index dae1c74..8c3a87e 100644 --- a/tests/test-data/acc-url.csv +++ b/tests/test-data/acc-url.csv @@ -1,4 +1,4 @@ -accession,name,input_moltype,md5sum,download_filename,url +accession,name,moltype,md5sum,download_filename,url GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index e3849bb..8c0af58 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -55,3 +55,15 @@ def test_urlsketch_simple(runtmp): assert sig.md5sum() == ss2.md5sum() else: assert sig.md5sum() == ss3.md5sum() + assert os.path.exists(failed) + with open(failed, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url = line.strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" From de12609ed525ca270d106e05d9ab2634ea02f260 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 20 May 2024 09:10:14 +0100 Subject: [PATCH 08/13] test keep fasta --- .../sourmash_plugin_directsketch/__init__.py | 2 +- tests/test_urlsketch.py | 39 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index 33c29a5..eebb794 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -78,7 +78,7 @@ def main(self, args): num_threads = set_thread_pool(args.cores) - notify(f"downloading and sketching all accessions in '{args.input_csv} using {num_threads} threads") + notify(f"downloading and sketching all accessions in '{args.input_csv} using {args.retry_times} retries and {num_threads} threads") super().main(args) status = sourmash_plugin_directsketch.do_gbsketch(args.input_csv, diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 8c0af58..80ac640 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -67,3 +67,42 @@ def test_urlsketch_simple(runtmp): assert moltype == "protein" assert download_filename == "GCA_000175535.1_protein.faa.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + +def test_urlsketch_save_fastas(runtmp): + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # why does this need ksize =30 and not ksize = 10!??? + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + fa_files = os.listdir(out_dir) + assert set(fa_files) == set(['GCA_000175535.1_genomic.urlsketch.fna.gz', 'GCA_000961135.2_protein.urlsketch.faa.gz', 'GCA_000961135.2_genomic.urlsketch.fna.gz']) + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + if 'GCA_000175535.1' in sig.name: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + elif 'GCA_000961135.2' in sig.name: + assert sig.name == ss2.name + if sig.minhash.moltype == 'DNA': + assert sig.md5sum() == ss2.md5sum() + else: + assert sig.md5sum() == ss3.md5sum() \ No newline at end of file From 52ff4962105230c47e81998ef73780ca882a3a7e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 20 May 2024 09:11:53 +0100 Subject: [PATCH 09/13] test download only --- .../sourmash_plugin_directsketch/__init__.py | 2 +- tests/test_urlsketch.py | 28 ++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index eebb794..bd52812 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -131,7 +131,7 @@ def main(self, args): args.param_string = ["k=31,scaled=1000"] notify(f"params: {args.param_string}") - if args.download_only and not args.keep_fastas: + if args.download_only and not args.keep_fasta: notify("Error: '--download-only' requires '--keep-fasta'.") sys.exit(-1) if args.output is None and not args.download_only: diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 80ac640..d140172 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -105,4 +105,30 @@ def test_urlsketch_save_fastas(runtmp): if sig.minhash.moltype == 'DNA': assert sig.md5sum() == ss2.md5sum() else: - assert sig.md5sum() == ss3.md5sum() \ No newline at end of file + assert sig.md5sum() == ss3.md5sum() + + +def test_urlsketch_download_only(runtmp, capfd): + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # why does this need ksize =30 and not ksize = 10!??? + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '--download-only', + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert not runtmp.last_result.out # stdout should be empty + fa_files = os.listdir(out_dir) + assert set(fa_files) == set(['GCA_000175535.1_genomic.urlsketch.fna.gz', 'GCA_000961135.2_protein.urlsketch.faa.gz', 'GCA_000961135.2_genomic.urlsketch.fna.gz']) + captured = capfd.readouterr() + assert "Failed to send signatures: channel closed" not in captured.err \ No newline at end of file From 34643ba37c53227bccbe26a1efea299de50ba00d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 20 May 2024 09:17:47 +0100 Subject: [PATCH 10/13] more tests --- tests/test_urlsketch.py | 137 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 136 insertions(+), 1 deletion(-) diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index d140172..eaf846f 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -131,4 +131,139 @@ def test_urlsketch_download_only(runtmp, capfd): fa_files = os.listdir(out_dir) assert set(fa_files) == set(['GCA_000175535.1_genomic.urlsketch.fna.gz', 'GCA_000961135.2_protein.urlsketch.faa.gz', 'GCA_000961135.2_genomic.urlsketch.fna.gz']) captured = capfd.readouterr() - assert "Failed to send signatures: channel closed" not in captured.err \ No newline at end of file + assert "Failed to send signatures: channel closed" not in captured.err + + +def test_urlsketch_bad_acc(runtmp): + acc_csv = get_test_data('acc-url.csv') + acc_mod = runtmp.output('acc_mod.csv') + with open(acc_csv, 'r') as inF, open(acc_mod, 'w') as outF: + lines = inF.readlines() + for line in lines: + # if this acc exist in line, copy it and write an extra line with an invalid accession + outF.write(line) + print(line) + if "GCA_000175535.1" in line: + mod_line = line.replace('GCA_000175535.1', 'GCA_0001755559.1') # add extra digit - should not be valid + print(mod_line) + outF.write(mod_line) + + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + ss2 = sourmash.load_one_signature(sig2, ksize=31) + # why does this need ksize =30 and not ksize = 10!??? + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') + + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + # Open the failed file + assert os.path.exists(failed) + with open(failed, 'r') as acc_file: + # Read the lines of the file + lines = acc_file.readlines() + # Check if the modified accession exists in the first column of any line + for line in lines: + print(line) + if "GCA_0001755559.1" in line.split(',')[0]: + assert True + break + else: + assert False, "Modified accession not found" + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + if 'GCA_000175535.1' in sig.name: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + elif 'GCA_000961135.2' in sig.name: + assert sig.name == ss2.name + if sig.minhash.moltype == 'DNA': + assert sig.md5sum() == ss2.md5sum() + else: + assert sig.md5sum() == ss3.md5sum() + +def test_urlsketch_missing_accfile(runtmp, capfd): + acc_csv = runtmp.output('acc1.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + captured = capfd.readouterr() + print(captured.err) + assert "Error: No such file or directory" in captured.err + + +def test_urlsketch_empty_accfile(runtmp, capfd): + acc_csv = get_test_data('acc1.csv') + with open(acc_csv, 'w') as file: + file.write('') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + captured = capfd.readouterr() + print(captured.err) + assert 'Error: Invalid column names in CSV file. Columns should be: ["accession", "name", "moltype", "md5sum", "download_filename", "url"]' in captured.err + + +def test_urlsketch_bad_acc_fail(runtmp, capfd): + acc_csv = get_test_data('acc-url.csv') + acc_mod = runtmp.output('acc_mod.csv') + with open(acc_csv, 'r') as inF, open(acc_mod, 'w') as outF: + lines = inF.readlines() + outF.write(lines[0]) # write the header line + for line in lines: + # if this acc exist in line, copy it and write + if "GCA_000175535.1" in line: + mod_line = line.replace('GCA_000175535.1', 'GCA_0001755559.1') # add extra digit - should not be valid + print(mod_line) + outF.write(mod_line) + + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000") + + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + assert "Error: No signatures written, exiting." in captured.err + + + +def test_urlsketch_missing_output(runtmp): + # no output sig zipfile provided but also not --download-only + acc_csv = runtmp.output('acc1.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_csv, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000") + + assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err \ No newline at end of file From 9b0e53c44d0e45a81f4e91ebdcca381ccfb7a2b7 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 20 May 2024 10:02:08 +0100 Subject: [PATCH 11/13] better failure testing --- tests/test_gbsketch.py | 20 +++++++-------- tests/test_urlsketch.py | 57 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 12 deletions(-) diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index 4a5d04e..3308e40 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -57,16 +57,16 @@ def test_gbsketch_simple(runtmp): assert sig.md5sum() == ss3.md5sum() assert os.path.exists(failed) with open(failed, 'r') as failF: - header = next(failF).strip() - assert header == "accession,name,moltype,md5sum,download_filename,url" - for line in failF: - print(line) - acc, name, moltype, md5sum, download_filename, url = line.strip().split(',') - assert acc == "GCA_000175535.1" - assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" - assert moltype == "protein" - assert download_filename == "GCA_000175535.1_protein.faa.gz" - assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + fail_lines = failF.readlines() + print(fail_lines) + assert len(fail_lines) == 2 + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" + acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" def test_gbsketch_simple_url(runtmp): diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index eaf846f..c7f0feb 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -31,7 +31,6 @@ def test_urlsketch_simple(runtmp): sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') ss1 = sourmash.load_one_signature(sig1, ksize=31) ss2 = sourmash.load_one_signature(sig2, ksize=31) - # why does this need ksize =30 and not ksize = 10!??? ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, @@ -68,6 +67,7 @@ def test_urlsketch_simple(runtmp): assert download_filename == "GCA_000175535.1_protein.faa.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + def test_urlsketch_save_fastas(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') @@ -266,4 +266,57 @@ def test_urlsketch_missing_output(runtmp): '--failed', failed, '-r', '1', '--param-str', "dna,k=31,scaled=1000") - assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err \ No newline at end of file + assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err + + +def test_urlsketch_from_gbsketch_failed(runtmp, capfd): + acc_csv = get_test_data('acc.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + assert os.path.exists(failed) + with open(failed, 'r') as failF: + fail_lines = failF.readlines() + assert len(fail_lines) == 2 + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" + acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + assert not runtmp.last_result.out # stdout should be empty + + out2 = runtmp.output('failed-retry.zip') + fail2 = runtmp.output('fail2.csv') + with pytest.raises(utils.SourmashCommandFailed): + + runtmp.sourmash('scripts', 'urlsketch', failed, '-o', out2, + '--failed', fail2, '-r', '1', + '-p', "protein,k=10,scaled=200") + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + assert "Error: No signatures written, exiting." in captured.err + + # since no protein file exists, fail2 should just be the same as failed + assert os.path.exists(fail2) + with open(fail2, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url = line.strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + # TODO: fix download_filename + # assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + + +# def test_urlsketch_from_urlsketch_failed(runtmp, capfd): From c328cc3da246e0861f8a5be90223b68209b8716d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 21 May 2024 06:52:41 +0100 Subject: [PATCH 12/13] fix dl filename --- src/directsketch.rs | 7 +++---- src/utils.rs | 12 +++--------- tests/test_urlsketch.py | 3 +-- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index 90790b8..597f799 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -411,13 +411,12 @@ async fn dl_sketch_url( { Some(data) => { // check keep_fastas instead?? - if let Some(download_filename) = download_filename { + if let Some(ref download_filename) = download_filename { let path = location.join(download_filename); fs::write(path, &data).context("Failed to write data to file")?; } if !download_only { - // let filename = download_filename.clone().unwrap(); - let filename = "".to_string(); + let filename = download_filename.clone().unwrap_or("".to_string()); // sketch data match moltype { InputMolType::Dna => sigs.extend( @@ -451,7 +450,7 @@ async fn dl_sketch_url( name: name.clone(), moltype: moltype.to_string(), md5sum: expected_md5.map(|x| x.to_string()), - download_filename: download_filename, + download_filename, url: Some(url), }; failed.push(failed_download); diff --git a/src/utils.rs b/src/utils.rs index d6618d5..ab91ac7 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -173,7 +173,6 @@ pub fn load_gbassembly_info(input_csv: String) -> Result<(Vec, u Ok((results, row_count)) } -#[allow(dead_code)] pub fn load_accession_info( input_csv: String, keep_fasta: bool, @@ -228,14 +227,9 @@ pub fn load_accession_info( .parse::() .map_err(|_| anyhow!("Invalid 'moltype' value"))?; let expected_md5sum = record.get(3).map(|s| s.to_string()); - let mut download_filename = None; - if keep_fasta { - download_filename = Some( - record - .get(4) - .ok_or_else(|| anyhow!("Missing 'download_filename' field"))? - .to_string(), - ); + let download_filename = record.get(4).map(|s| s.to_string()); + if keep_fasta && download_filename.is_none() { + return Err(anyhow!("Missing 'download_filename' field")); } let url = record .get(5) diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index c7f0feb..64d1f1f 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -314,8 +314,7 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd): assert acc == "GCA_000175535.1" assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" assert moltype == "protein" - # TODO: fix download_filename - # assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert download_filename == "GCA_000175535.1_protein.faa.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" From 1298edb87b1e20b94791e07122d022f2498f43ac Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 21 May 2024 07:54:54 +0100 Subject: [PATCH 13/13] add urlsketch docs --- README.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2c9edc4..df2ef53 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,16 @@ [![DOI](https://zenodo.org/badge/792101561.svg)](https://zenodo.org/doi/10.5281/zenodo.11165725) -tl;dr - download and sketch NCBI Assembly Datasets by accession +tl;dr - download and sketch data directly ## About -This plugin is an attempt to improve database generation by downloading assemblies, checking md5sum, and sketching to a sourmash zipfile. FASTA files can also be saved if desired. It's quite fast, but still very much at alpha level. Here be dragons. +Commands: + +- `gbsketch` - download and sketch NCBI Assembly Datasets by accession +- `urlsketch` - download and sketch directly from a url + +This plugin is an attempt to improve sourmash database generation by downloading files, checking md5sum if provided or accessible, and sketching to a sourmash zipfile. FASTA files can also be saved if desired. It's quite fast, but still very much at alpha level. Here be dragons. ## Installation @@ -16,7 +21,8 @@ This plugin is an attempt to improve database generation by downloading assembli pip install sourmash_plugin_directsketch ``` -## Usage +## `gbsketch` +download and sketch NCBI Assembly Datasets by accession ### Create an input file @@ -43,15 +49,13 @@ For reference: To run the test accession file at `tests/test-data/acc.csv`, run: ``` -sourmash scripts gbsketch tests/test-data/acc.csv -o test.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 +sourmash scripts gbsketch tests/test-data/acc.csv -o test-gbsketch.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 ``` Full Usage: ``` -usage: gbsketch [-h] [-q] [-d] -o OUTPUT [-f FASTAS] [-k] [--download-only] [--failed FAILED] [-p PARAM_STRING] [-c CORES] - [-r RETRY_TIMES] [-g | -m] - input_csv +usage: gbsketch [-h] [-q] [-d] [-o OUTPUT] [-f FASTAS] [-k] [--download-only] [--failed FAILED] [-p PARAM_STRING] [-c CORES] [-r RETRY_TIMES] [-g | -m] input_csv download and sketch GenBank assembly datasets @@ -66,7 +70,7 @@ options: output zip file for the signatures -f FASTAS, --fastas FASTAS Write fastas here - -k, --keep-fastas write FASTA files in addition to sketching. Default: do not write FASTA files + -k, --keep-fasta write FASTA files in addition to sketching. Default: do not write FASTA files --download-only just download genomes; do not sketch --failed FAILED csv of failed accessions and download links (should be mostly protein). -p PARAM_STRING, --param-string PARAM_STRING @@ -76,7 +80,61 @@ options: -r RETRY_TIMES, --retry-times RETRY_TIMES number of times to retry failed downloads -g, --genomes-only just download and sketch genome (DNA) files - -m, --proteomes-only just download and sketch proteome (protein) files +``` + +## `urlsketch` +download and sketch directly from a url +### Create an input file + +First, create a file, e.g. `acc-url.csv` with identifiers, sketch names, and other required info. +``` +accession,name,moltype,md5sum,download_filename,url +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz +``` +> Six columns must be present: +> - `accession` - an accession or unique identifier. Ideally no spaces. +> - `name` - full name for the sketch. +> - `moltype` - is the file 'dna' or 'protein'? +> - `md5sum` - expected md5sum (optional, will be checked after download if provided) +> - `download_filename` - filename for FASTA download. Required if `--keep-fastas`, but useful for signatures, too (saved in sig data). +> - `url` - direct link for the file + +### Run: + +To run the test accession file at `tests/test-data/acc-url.csv`, run: +``` +sourmash scripts urlsketch tests/test-data/acc-url.csv -o test-urlsketch.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 +``` + +Full Usage: +``` +usage: urlsketch [-h] [-q] [-d] [-o OUTPUT] [-f FASTAS] [-k] [--download-only] [--failed FAILED] [-p PARAM_STRING] [-c CORES] [-r RETRY_TIMES] input_csv + +download and sketch GenBank assembly datasets + +positional arguments: + input_csv a txt file or csv file containing accessions in the first column + +options: + -h, --help show this help message and exit + -q, --quiet suppress non-error output + -d, --debug provide debugging output + -o OUTPUT, --output OUTPUT + output zip file for the signatures + -f FASTAS, --fastas FASTAS + Write fastas here + -k, --keep-fasta, --keep-fastq + write FASTA/Q files in addition to sketching. Default: do not write FASTA files + --download-only just download genomes; do not sketch + --failed FAILED csv of failed accessions and download links (should be mostly protein). + -p PARAM_STRING, --param-string PARAM_STRING + parameter string for sketching (default: k=31,scaled=1000) + -c CORES, --cores CORES + number of cores to use (default is all available) + -r RETRY_TIMES, --retry-times RETRY_TIMES + number of times to retry failed downloads ``` ## Code of Conduct