Skip to content

Commit

Permalink
Merge pull request #31 from sourmash-bio/no-require-output-zip
Browse files Browse the repository at this point in the history
do not require signature output
  • Loading branch information
bluegenes authored May 13, 2024
2 parents d8cf235 + b252af7 commit ec560b0
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 84 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ If you choose to provide it, `ftp_path` must be the `ftp_path` column from NCBI'

For reference:

- example `ftp_path`: `https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/036/600/915/GCA_036600915.1_ASM3660091v1`
- bacteria assembly summary file: `https:ftp://ftp.ncbi.nih.gov/genomes/genbank/bacteria/assembly_summary.txt`
- example `ftp_path`: [https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/036/600/915/GCA_036600915.1_ASM3660091v1](https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/036/600/915/GCA_036600915.1_ASM3660091v1)
- bacteria assembly summary file: [https://ftp.ncbi.nih.gov/genomes/genbank/bacteria/assembly_summary.txt](https://ftp.ncbi.nih.gov/genomes/genbank/bacteria/assembly_summary.txt)

### Run:

Expand Down
148 changes: 77 additions & 71 deletions src/directsketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -422,82 +422,84 @@ async fn write_sig(

pub fn sigwriter_handle(
mut recv_sigs: tokio::sync::mpsc::Receiver<Vec<Signature>>,
output_sigs: String,
output_sigs: Option<String>,
error_sender: tokio::sync::mpsc::Sender<anyhow::Error>,
) -> tokio::task::JoinHandle<()> {
tokio::spawn(async move {
let mut md5sum_occurrences = HashMap::new();
let mut manifest_rows = Vec::new();
let mut wrote_sigs = false;
let outpath: PathBuf = output_sigs.into();
if let Some(outpath) = output_sigs {
let outpath: PathBuf = outpath.into();

let file = match File::create(&outpath).await {
Ok(file) => file,
Err(e) => {
let error =
anyhow::Error::new(e).context("Failed to create file at specified path");
let _ = error_sender.send(error).await; // Send the error through the channel
return; // Simply exit the task as error handling is managed elsewhere
}
};
let mut zip_writer = ZipFileWriter::with_tokio(file);

while let Some(sigs) = recv_sigs.recv().await {
for sig in sigs {
match write_sig(
&sig,
&mut md5sum_occurrences,
&mut manifest_rows,
&mut zip_writer,
)
.await
{
Ok(_) => wrote_sigs = true,
Err(e) => {
let error = e.context("Error processing signature");
if (error_sender.send(error).await).is_err() {
return; // Exit on failure to send error
let file = match File::create(&outpath).await {
Ok(file) => file,
Err(e) => {
let error =
anyhow::Error::new(e).context("Failed to create file at specified path");
let _ = error_sender.send(error).await; // Send the error through the channel
return; // Simply exit the task as error handling is managed elsewhere
}
};
let mut zip_writer = ZipFileWriter::with_tokio(file);

while let Some(sigs) = recv_sigs.recv().await {
for sig in sigs {
match write_sig(
&sig,
&mut md5sum_occurrences,
&mut manifest_rows,
&mut zip_writer,
)
.await
{
Ok(_) => wrote_sigs = true,
Err(e) => {
let error = e.context("Error processing signature");
if (error_sender.send(error).await).is_err() {
return; // Exit on failure to send error
}
}
}
}
}
}

if wrote_sigs {
println!("Writing manifest");
let manifest_filename = "SOURMASH-MANIFEST.csv".to_string();
let manifest: Manifest = manifest_rows.clone().into();
let mut manifest_buffer = Vec::new();
manifest
.to_writer(&mut manifest_buffer)
.expect("Failed to serialize manifest"); // Handle this more gracefully in production

let now = Utc::now();
let builder = ZipEntryBuilder::new(manifest_filename.into(), Compression::Stored)
.last_modification_date(ZipDateTime::from_chrono(&now));

if let Err(e) = zip_writer
.write_entry_whole(builder, &manifest_buffer)
.await
{
let error = anyhow::Error::new(e).context("Failed to write manifest to ZIP");
let _ = error_sender.send(error).await;
return;
}
if wrote_sigs {
println!("Writing manifest");
let manifest_filename = "SOURMASH-MANIFEST.csv".to_string();
let manifest: Manifest = manifest_rows.clone().into();
let mut manifest_buffer = Vec::new();
manifest
.to_writer(&mut manifest_buffer)
.expect("Failed to serialize manifest"); // Handle this more gracefully in production

let now = Utc::now();
let builder = ZipEntryBuilder::new(manifest_filename.into(), Compression::Stored)
.last_modification_date(ZipDateTime::from_chrono(&now));

if let Err(e) = zip_writer
.write_entry_whole(builder, &manifest_buffer)
.await
{
let error = anyhow::Error::new(e).context("Failed to write manifest to ZIP");
let _ = error_sender.send(error).await;
return;
}

if let Err(e) = zip_writer.close().await {
let error = anyhow::Error::new(e).context("Failed to close ZIP file");
let _ = error_sender.send(error).await;
return;
if let Err(e) = zip_writer.close().await {
let error = anyhow::Error::new(e).context("Failed to close ZIP file");
let _ = error_sender.send(error).await;
return;
}
} else {
let error = anyhow::Error::new(std::io::Error::new(
std::io::ErrorKind::Other,
"No signatures written",
));
let _ = error_sender.send(error).await; // Send error about no signatures written
}
} else {
let error = anyhow::Error::new(std::io::Error::new(
std::io::ErrorKind::Other,
"No signatures written",
));
let _ = error_sender.send(error).await; // Send error about no signatures written
}
drop(error_sender);
drop(error_sender); // should be dropped automatically as it goes out of scope, but just in case
})
}

Expand Down Expand Up @@ -569,7 +571,6 @@ pub fn error_handler(
pub async fn download_and_sketch(
py: Python,
input_csv: String,
output_sigs: String,
param_str: String,
failed_csv: String,
retry_times: u32,
Expand All @@ -578,13 +579,16 @@ pub async fn download_and_sketch(
genomes_only: bool,
proteomes_only: bool,
download_only: bool,
output_sigs: Option<String>,
) -> Result<(), anyhow::Error> {
// if sig output doesn't end in zip, bail
if Path::new(&output_sigs)
.extension()
.map_or(true, |ext| ext != "zip")
{
bail!("Output must be a zip file.");
// if sig output provided but doesn't end in zip, bail
if let Some(ref output_sigs) = output_sigs {
if Path::new(&output_sigs)
.extension()
.map_or(true, |ext| ext != "zip")
{
bail!("Output must be a zip file.");
}
}
// set up fasta download path
let download_path = PathBuf::from(fasta_location);
Expand Down Expand Up @@ -699,9 +703,11 @@ pub async fn download_and_sketch(
.await;
match result {
Ok((sigs, failed_downloads)) => {
if let Err(e) = send_sigs.send(sigs).await {
eprintln!("Failed to send signatures: {}", e);
let _ = send_errors.send(e.into()).await; // Send the error through the channel
if !sigs.is_empty() {
if let Err(e) = send_sigs.send(sigs).await {
eprintln!("Failed to send signatures: {}", e);
let _ = send_errors.send(e.into()).await; // Send the error through the channel
}
}
for fail in failed_downloads {
if let Err(e) = send_failed.send(fail).await {
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,17 @@ fn do_gbsketch(
input_csv: String,
param_str: String,
failed_csv: String,
output_sigs: String,
retry_times: u32,
fasta_location: String,
keep_fastas: bool,
genomes_only: bool,
proteomes_only: bool,
download_only: bool,
output_sigs: Option<String>,
) -> anyhow::Result<u8> {
match directsketch::download_and_sketch(
py,
input_csv,
output_sigs,
param_str,
failed_csv,
retry_times,
Expand All @@ -74,6 +73,7 @@ fn do_gbsketch(
genomes_only,
proteomes_only,
download_only,
output_sigs,
) {
Ok(_) => Ok(0),
Err(e) => {
Expand Down
17 changes: 12 additions & 5 deletions src/python/sourmash_plugin_directsketch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class Download_and_Sketch_Assemblies(CommandLinePlugin):
def __init__(self, p):
super().__init__(p)
p.add_argument('input_csv', help="a txt file or csv file containing accessions in the first column")
p.add_argument('-o', '--output', required=True,
p.add_argument('-o', '--output', default=None,
help='output zip file for the signatures')
p.add_argument('-f', '--fastas',
help='Write fastas here', default = '.')
Expand Down Expand Up @@ -67,7 +67,9 @@ def main(self, args):
if args.download_only and not args.keep_fastas:
notify("Error: '--download-only' requires '--keep-fastas'.")
sys.exit(-1)

if args.output is None and not args.download_only:
notify("Error: output signature zipfile is required if not using '--download-only'.")
sys.exit(-1)

# convert to a single string for easier rust handling
args.param_string = "_".join(args.param_string)
Expand All @@ -82,14 +84,19 @@ def main(self, args):
status = sourmash_plugin_directsketch.do_gbsketch(args.input_csv,
args.param_string,
args.failed,
args.output,
args.retry_times,
args.fastas,
args.keep_fastas,
args.genomes_only,
args.proteomes_only,
args.download_only)
args.download_only,
args.output)

if status == 0:
notify(f"...gbsketch is done! Sigs in '{args.output}'. Fastas in '{args.fastas}'.")
notify("...gbsketch is done!")
if args.output is not None:
notify(f"Sigs in '{args.output}'.")
if args.keep_fastas:
notify(f"FASTAs in '{args.fastas}'.")

return status
24 changes: 20 additions & 4 deletions tests/test_gbsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ def test_gbsketch_save_fastas(runtmp):
else:
assert sig.md5sum() == ss3.md5sum()

def test_gbsketch_download_only(runtmp):
def test_gbsketch_download_only(runtmp, capfd):
acc_csv = get_test_data('acc.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('failed.csv')
Expand All @@ -264,14 +264,15 @@ def test_gbsketch_download_only(runtmp):
# why does this need ksize =30 and not ksize = 10!???
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein')

runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--download-only',
runtmp.sourmash('scripts', 'gbsketch', acc_csv, '--download-only',
'--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fastas',
'--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200")

assert os.path.exists(output) # would be better if this didn't exist
assert not runtmp.last_result.out # stdout should be empty
fa_files = os.listdir(out_dir)
assert set(fa_files) == set(['GCA_000175535.1_genomic.fna.gz', 'GCA_000961135.2_protein.faa.gz', 'GCA_000961135.2_genomic.fna.gz'])
captured = capfd.readouterr()
assert "Failed to send signatures: channel closed" not in captured.err


def test_gbsketch_bad_acc(runtmp):
Expand Down Expand Up @@ -350,6 +351,7 @@ def test_gbsketch_missing_accfile(runtmp, capfd):
print(captured.err)
assert "Error: No such file or directory" in captured.err


def test_gbsketch_empty_accfile(runtmp, capfd):
acc_csv = get_test_data('acc1.csv')
with open(acc_csv, 'w') as file:
Expand Down Expand Up @@ -430,4 +432,18 @@ def test_gbsketch_cols_trailing_commas(runtmp, capfd):

captured = capfd.readouterr()
print(captured.err)
assert 'Error: CSV error: record 1 (line: 1, byte: 24): found record with 2 fields, but the previous record has 3 fields' in captured.err
assert 'Error: CSV error: record 1 (line: 1, byte: 24): found record with 2 fields, but the previous record has 3 fields' in captured.err


def test_gbsketch_missing_output(runtmp):
# no output sig zipfile provided but also not --download-only
acc_csv = runtmp.output('acc1.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('failed.csv')

with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'gbsketch', acc_csv,
'--failed', failed, '-r', '1',
'--param-str', "dna,k=31,scaled=1000")

assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err

0 comments on commit ec560b0

Please sign in to comment.