diff --git a/.gitignore b/.gitignore
index 4fdf5b7..44f7dbe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 /target
 /test-data
-/benchmarks/raw
+/benchmarks_raw
 Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
index 693d7b3..fd8133a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,15 +1,15 @@
 [package]
-name = "krakenxtract"
-version = "0.3.0"
+name = "kractor"
+version = "0.4.0"
 edition = "2021"
 authors = ["Samuel Sims"]
 description = "Extract reads from a FASTQ file based on taxonomic classification via Kraken2."
 readme = "README.md"
-repository = "https://github.com/Sam-Sims/krakenXtract"
+repository = "https://github.com/Sam-Sims/kractor"
 license = "MIT"
-keywords = ["kraken", "bioinformatics", "taxonomy", "fastq"]
+keywords = ["kraken", "bioinformatics", "taxonomy", "fastq", "metagenomics"]
 categories = ["command-line-utilities", "science"]
-documentation = "https://docs.rs/crate/krakenxtract/"
+documentation = "https://docs.rs/crate/kractor/"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
@@ -18,8 +18,10 @@
 chrono = "0.4.31"
 clap = { version = "4.3.19", features = ["derive"] }
 crossbeam = "0.8.2"
 env_logger = "0.10.0"
+lazy_static = "1.4.0"
 log = "0.4.20"
-niffler = { version = "2.5.0", features = ["gz_cloudflare_zlib"] }
+niffler = { version = "2.5.0", default-features = false, features = ["gz", "bz2", "gz_cloudflare_zlib"] }
 noodles = { version = "0.51.0", features = ["fastq", "fasta"] }
+serde_json = "1.0.107"
diff --git a/README.md b/README.md
index be07e7d..7c757b1 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,34 @@
-# krakenXtract
-
-[![Release](https://github.com/Sam-Sims/krakenXtract/actions/workflows/release.yaml/badge.svg)](https://github.com/Sam-Sims/krakenXtract/actions/workflows/release.yaml)
-![GitHub release (with filter)](https://img.shields.io/github/v/release/sam-sims/krakenxtract)
-![crates.io](https://img.shields.io/crates/v/krakenxtract
+[![Release](https://github.com/Sam-Sims/Kractor/actions/workflows/release.yaml/badge.svg)](https://github.com/Sam-Sims/Kractor/actions/workflows/release.yaml)
+![GitHub release (with filter)](https://img.shields.io/github/v/release/sam-sims/Kractor)
+![crates.io](https://img.shields.io/crates/v/kractor
 )
-Extract reads from a FASTQ file based on taxonomic classification via Kraken2.
-
+# Kractor
 
-## Motivation
+**kra**ken extr**actor**
 
-Heavily inspired by the great [KrakenTools](https://github.com/jenniferlu717/KrakenTools).
+Kractor extracts sequencing reads based on taxonomic classifications obtained via [Kraken2](https://github.com/DerrickWood/kraken2). It consumes paired or unpaired `fastq[.gz/.bz]` files as input alongside a Kraken2 standard output. It can optionally consume a Kraken2 report to extract all taxonomic parents and children of a given taxid. Fast by default, it outputs `fast[q/a]` files that can optionally be compressed.
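+
+Each line of the Kraken2 standard output that Kractor consumes is tab-separated: a classified/unclassified flag (`C`/`U`), the read ID, the assigned taxid, the read length, and the LCA k-mer mappings. For example (read IDs and counts here are illustrative):
+
+```
+C    read_1    96241    1352    96241:640 0:320 96241:358
+U    read_2    0        873     0:839
+```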
 
-Having been wanting to experiment with Rust for a while, this is essentially an implementation of the `extract_kraken_reads.py` script, [re-implemented](https://www.reddit.com/media?url=https%3A%2F%2Fi.redd.it%2Fgood-for-you-crab-v0-5v9ygeh9r1c91.jpg%3Fs%3Dd759db5275e32c6e2bd5c22bddbd783acca46247) in Rust.
+Kractor significantly enhances processing speed compared to KrakenTools for both paired and unpaired reads. Paired reads are processed approximately 21x faster for compressed fastqs and 10x faster for uncompressed. Unpaired reads are approximately 4x faster for both compressed and uncompressed inputs.
 
-The main motivation was to provide a speedup when extracting a large number of reads from large FASTQ files - and to learn Rust!
+For additional details, refer to the [benchmarks](benchmarks/benchmarks.md).
 
-## Current features
+## Motivation
 
-- Extract all reads from a `fastq` file based on a taxonomic id
-- Extract all the parents or the children of the specified taxon id
-- Supports single or paired-end `fastq` files
-- Supports both uncompressed or `gzip` inputs and outputs.
-- Multithreaded
-- ~4.4x speed up compared to KrakenTools
+Heavily inspired by the great [KrakenTools](https://github.com/jenniferlu717/KrakenTools).
 
-## Benchmarks (WIP)
+At the time of writing, KrakenTools is a single-threaded Python implementation, which limits its speed when processing large, paired-end fastq files. The main motivation was to enhance speed when parsing and extracting (writing) a large volume of reads - and also to learn Rust!
 
-For more detail see [benchmarks](benchmarks/benchmarks.md)
 
 ## Installation
 
-### Precompiled:
-Github release: [0.3.0](https://github.com/Sam-Sims/krakenXtract/releases/tag/v0.3.0)
+### Binaries:
+
+Precompiled binaries for Linux, macOS and Windows are attached to the latest release [0.4.0](https://github.com/Sam-Sims/Kractor/releases/tag/v0.4.0)
 
 ### Cargo:
 
 Requires [cargo](https://www.rust-lang.org/tools/install)
 
 ```
-cargo install krakenxtract
+cargo install kractor
 ```
 
 ### Build from source:
 
@@ -49,33 +40,33 @@ To install please refer to the rust documentation: [docs](https://www.rust-lang.
 
 #### Clone the repository:
 
 ```bash
-git clone https://github.com/Sam-Sims/krakenxtract
+git clone https://github.com/Sam-Sims/Kractor
 ```
 
 #### Build and add to path:
 
 ```bash
-cd kraken-extract
+cd Kractor
 cargo build --release
 export PATH=$PATH:$(pwd)/target/release
 ```
 
-All executables will be in the directory kraken-extract/target/release.
+All executables will be in the directory Kractor/target/release.
 
 ## Usage
-
+![Kractor screenshot](screenshot.png)
 ### Basic Usage:
 
 ```bash
-krakenXtract -k <kraken_output> -i <fastq_file> -t <taxid> -o <output_file>
+kractor -k <kraken_output> -i <fastq_file> -t <taxid> -o <output_file> > kractor_report.json
 ```
 
 Or, if you have paired-end illumina reads:
 
 ```bash
-krakenXtract -k <kraken_output> -i <fastq_1> -i <fastq_2> -t <taxid> -o <output_1> -o <output_2>
+kractor -k <kraken_output> -i <fastq_1> -i <fastq_2> -t <taxid> -o <output_1> -o <output_2>
 ```
 
 If you want to extract all children of a taxon:
 
 ```bash
-krakenXtract -k <kraken_output> -r <kraken_report> -i <fastq_file> -t <taxid> --children -o <output_file>
+kractor -k <kraken_output> -r <kraken_report> -i <fastq_file> -t <taxid> --children -o <output_file>
 ```
 
 ### Arguments:
 
@@ -86,7 +77,7 @@ krakenXtract -k -r -i -t -i `
@@ -100,7 +91,7 @@ Using `--input` once but passing both files: `-i
 
 This option will specify the output files containing the extracted reads. The order of the output files is assumed to be the same as the input.
 
-By default the compression will be inferred from the output file extension for supported file types (`gzip`, `bzip`, `lzma` and `zstd`). If the output type cannot be inferred, plaintext will be output.
+By default the compression will be inferred from the output file extension for supported file types (`gz`, `bz`). If the output type cannot be inferred, plaintext will be output.
 
 #### Kraken Output
 
@@ -124,17 +115,15 @@ This option will manually set the compression mode used for the output file and
 
 Valid values are:
 
-- `gz` to output gzip
-- `bz` to output bzip
-- `lzma` to output lzma
-- `zstd` to output zstd
+- `gz` to output gzip
+- `bz2` to output bzip2
 - `none` to not apply compression
 
 #### Compression level
 
 `-l, --level`
 
-This option will set the compression level to use if compressing the output. Should be a value between 1-9 with 1 being the fastest but largest file size and 9 is for slowest, but best file size. By default this is set at 6, but for the highest speeds 2 is a good trade off for speed/filesize.
+This option will set the compression level to use when compressing the output. It should be a value between 1-9, with 1 being the fastest but giving the largest file size, and 9 being the slowest but giving the smallest file size. By default this is set to 2, as it is a good trade-off between speed and file size.
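+
+For example, the following writes gzip-compressed output at the maximum compression level; the `gz` format is inferred from the `.gz` extension (file names are illustrative):
+
+```bash
+kractor -k <kraken_output> -i <fastq_file> -t <taxid> -o output.fq.gz -l 9
+```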
 
 #### Output fasta
 
@@ -166,6 +155,12 @@ This will extract all the reads classified as descendants or subtaxa of `--taxid`
 
 This will output every read except those matching the taxid. Works with `--parents` and `--children`
 
+#### Skip report
+
+`--no-json`
+
+This will skip the JSON report that is output to stdout upon programme completion.
+
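+When enabled, the report printed to stdout has the following shape; the fields match those emitted in `src/main.rs`, and the values below are illustrative:
+
+```json
+{
+  "taxon_id_count": 1,
+  "taxon_ids": [96241],
+  "reads_in": 3491078,
+  "reads_out": 490984,
+  "input_format": "single-end",
+  "output_format": "fastq"
+}
+```
+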
 ## Future plans
 
 - [x] Support unzipped fastq files
@@ -173,20 +168,23 @@ This will output every read except those matching the taxid. Works with `--paren
 - [x] `--include-parents` and `--include-children` arguments
 - [ ] Supply multiple taxonomic IDs to extract
 - [x] Exclude taxonomic IDs
-- [ ] `--append`
 - [x] `--compression-mode`
 - [x] More verbose output
-- [ ] Proper benchmarks
-- [x] Output fasta format (for blast??)
+- [x] Benchmarks
+- [x] Output fasta
 - [x] Output non `gz`
 - [ ] Tests
 
 ## Version
 
-- 0.3.0
+- 0.4.0
 
 ## Changelog
 
+### 0.4.0
+
+- JSON report included in stdout upon successful completion (can be disabled with `--no-json`)
+- Renamed krakenXtract to Kractor
+
 ### 0.3.0
 
 - Support for paired-end files
diff --git a/benchmarks/benchmarks.md b/benchmarks/benchmarks.md
index 3b18693..f1efb29 100644
--- a/benchmarks/benchmarks.md
+++ b/benchmarks/benchmarks.md
@@ -1,67 +1,114 @@
 # Benchmarks:
 
-Benchmarks were run in triplicate with 1 warmup run - using [hyperfine](https://github.com/sharkdp/hyperfine).
+Benchmarks were run 5x each with 1 warmup run, using [hyperfine](https://github.com/sharkdp/hyperfine).
 
-They were run on my own PC running Ubuntu in WSL2 (Plans to test in native linux):
+They were run on my own PC running Ubuntu 23.04:
 
 - i7 12700k - 12 cores
-- 32GB memory with 16GB Assigned to WSL2
+- 32GB memory
 - M.2 SSD drive
 
 Hyperfine command:
 
 ```bash
-hyperfine --warmup 1 -r 3 --export-json --export-markdown
+hyperfine --warmup 1 -r 5 --export-json --export-markdown
 ```
 
+## Unpaired
 
-## Benchmark 1:
-ONT long read sequencing, outputting a plain (non-compressed) fastq file
+Unpaired tests were based on the [Zymo mock community](https://github.com/LomanLab/mockcommunity) Zymo-GridION-EVEN-BB-SN dataset.
+
+### Benchmark 1:
+ONT long read sequencing, input `fastq` format, outputting a non-compressed `fastq` file. Extracting all reads classified as *Bacillus spizizenii*.
 
 *Inputs:*
 
 | File type | Platform | Total reads | Reads to extract | Kraken output size| Output|
 |-----------|----------|-------------|------------------|-------------------|-------|
-|`.fastq.gz`|ONT |5,454,495 |3,489,386 |1.8GB |`.fastq`
+|`.fastq` |ONT |3,491,078 |490,984 |885MB |`.fastq`
 
 *Commands run:*
 
 | Tool | Command |
 |------|---------|
-| krakenXtract | `./target/release/krakenxtract --kraken test-data/output.kraken --fastq test-data/ont_test.fastq.gz --taxid 666 --output krakenxtract_benchmark.fq --no-compress` |
-| KrakenTools | `extract_kraken_reads.py -k test-data/output.kraken -s test-data/ont_test.fastq.gz -o krakentools_benchmark.fq --fastq-output -t 666` |
+| `KrakenTools` | `extract_kraken_reads.py -s Zymo-GridION-EVEN-BB-SN.fq -k out.kraken -o krakentools.fq -t 96241 --fastq-output` |
+| `Kractor` | `kractor -i Zymo-GridION-EVEN-BB-SN.fq -k out.kraken -o kractor.fq -t 96241` |
 
 *Results:*
 
 | Tool | Mean [s] | Min [s] | Max [s] | Relative |
 |:---|---:|---:|---:|---:|
-| `krakenXtract` | 80.585 ± 9.795 | 72.964 | 91.633 | 1.00 |
-| `KrakenTools` | 354.722 ± 8.613 | 349.398 | 364.659 | 4.40 ± 0.55 |
+| `KrakenTools` | 254.881 ± 9.482 | 242.553 | 263.158 | 1.00 |
+| `Kractor` | 58.253 ± 5.651 | 48.358 | 62.222 | 4.38 ± 0.45 |
+
+### Benchmark 2:
+ONT long read sequencing, input `fastq.gz` format, outputting a non-compressed `fastq` file. Extracting all reads classified as *Bacillus spizizenii*.
+
+*Inputs:*
+
+| File type | Platform | Total reads | Reads to extract | Kraken output size| Output|
+|-----------|----------|-------------|------------------|-------------------|-------|
+|`.fastq.gz`|ONT |3,491,078 |490,984 |885MB |`.fastq`
+
+*Commands run:*
 
-Welch t-test: t = -36.4, p = 4e-06
+| Tool | Command |
+|------|---------|
+| KrakenTools | `extract_kraken_reads.py -s Zymo-GridION-EVEN-BB-SN.fq.gz -k out.kraken -o krakentools.fq -t 96241 --fastq-output` |
+| Kractor | `kractor -i Zymo-GridION-EVEN-BB-SN.fq.gz -k out.kraken -o kractor.fq -t 96241` |
 
-There is a difference between the two benchmarks (p < 0.05).
-![Alt text](img/ont-benchmark-whisker.png)
+*Results:*
 
+| Command | Mean [s] | Min [s] | Max [s] | Relative |
+|:---|---:|---:|---:|---:|
+| `KrakenTools` | 376.592 ± 3.905 | 373.343 | 383.315 | 1.00 |
+| `Kractor` | 100.044 ± 3.599 | 98.044 | 106.449 | 3.76 ± 0.14 |
 
+## Paired
 
-## Benchmark 2:
-ONT long read sequencing, outputting a compressed fastq file.
+Paired-end tests were based on SRA accession SRR19995508.
+
+### Benchmark 3:
+Illumina paired-end sequencing, input `fastq` format, outputting a non-compressed `fastq` file.
+
+*Inputs:*
+
+| File type | Platform | Total reads | Reads to extract | Kraken output size| Output|
+|-----------|----------|-------------|------------------|-------------------|-------|
+|`.fastq`|Illumina paired |53,526,611 |1,646,117 |9.3GB |`.fastq`
+
+*Commands run:*
+
+| Tool | Command |
+|------|---------|
+| KrakenTools | `extract_kraken_reads.py -s SRR19995508_R1.fastq -s2 SRR19995508_R2.fastq -o R1_tools.fq -o2 R2_tools.fq -k out.kraken -t 590 --fastq-output` |
+| Kractor | `kractor -i SRR19995508_R1.fastq -i SRR19995508_R2.fastq -k out.kraken -t 590 -o R1_kractor.fq -o R2_kractor.fq` |
+
+*Results:*
+
+| Command | Mean [s] | Min [s] | Max [s] | Relative |
+|:---|---:|---:|---:|---:|
+| `KrakenTools` | 898.306 ± 14.203 | 884.229 | 920.653 | 1.00 |
+| `Kractor` | 94.198 ± 2.317 | 90.852 | 96.474 | 9.54 ± 0.28 |
 
-NOTE: Kraken Tools has no option to output a compressed fastq.
+### Benchmark 4:
+Illumina paired-end sequencing, input `fastq.gz` format, outputting a non-compressed `fastq` file.
 
 *Inputs:*
 
 | File type | Platform | Total reads | Reads to extract | Kraken output size| Output|
 |-----------|----------|-------------|------------------|-------------------|-------|
-|`.fastq.gz`|ONT |5,454,495 |3,489,386 |1.8GB |`.fastq.gz`
+|`.fastq.gz`|Illumina paired |53,526,611 |1,646,117 |9.3GB |`.fastq`
 
 *Commands run:*
 
 | Tool | Command |
 |------|---------|
-| krakenXtract | `./target/release/krakenxtract --kraken test-data/output.kraken --fastq test-data/ont_test.fastq.gz --taxid 666 --output krakenxtract_benchmark.fq.gz` |
+| KrakenTools | `extract_kraken_reads.py -s SRR19995508_R1.fastq.gz -s2 SRR19995508_R2.fastq.gz -o R1_tools.fq -o2 R2_tools.fq -k out.kraken -t 2 --fastq-output` |
+| Kractor | `kractor -i SRR19995508_R1.fastq.gz -i SRR19995508_R2.fastq.gz -k out.kraken -t 2 -o R1_kractor.fq -o R2_kractor.fq` |
 
 *Results:*
 
 | Command | Mean [s] | Min [s] | Max [s] | Relative |
 |:---|---:|---:|---:|---:|
-| krakenXtract | 78.042 ± 1.622 | 76.361 | 79.597 | 1.00 |
+| `KrakenTools` | 1033.379 ± 25.238 | 1005.720 | 1068.522 | 1.00 |
+| `Kractor` | 49.071 ± 0.179 | 48.857 | 49.334 | 21.06 ± 0.52 |
\ No newline at end of file
diff --git a/screenshot.png b/screenshot.png
new file mode 100644
index 0000000..bee3083
Binary files /dev/null and b/screenshot.png differ
diff --git a/src/cli.rs b/src/cli.rs
index 01198a9..0337500 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -1,5 +1,5 @@
 use clap::Parser;
-use log::{debug, error, info, trace, warn};
+use log::error;
 
 #[derive(Parser, Debug)]
 #[command(
@@ -38,7 +38,7 @@ pub struct Cli {
     #[arg(
         short = 'l',
         long = "level",
-        default_value = "6",
+        default_value = "2",
         value_parser(validate_compression_level)
     )]
     pub compression_level: niffler::Level,
@@ -54,6 +54,9 @@ pub struct Cli {
     // Output reads in FASTA format
     #[arg(long, action)]
     pub output_fasta: bool,
+    // Don't output the JSON report
+    #[arg(long = "no-json")]
+    pub no_json: bool,
     // Verbose
     #[arg(short)]
     pub verbose: bool,
@@ -85,9 +88,7 @@ impl Cli {
 fn validate_compression(s: &str) -> Result<niffler::compression::Format, String> {
     match s {
         "gz" => Ok(niffler::compression::Format::Gzip),
-        "bz" => Ok(niffler::compression::Format::Bzip),
-        "lzma" => Ok(niffler::compression::Format::Lzma),
-        "zst" => Ok(niffler::compression::Format::Zstd),
+        "bz2" => Ok(niffler::compression::Format::Bzip),
         "none" => Ok(niffler::compression::Format::No),
         _ => Err(format!("Unknown compression type: {}", s)),
     }
diff --git a/src/main.rs b/src/main.rs
index dcca566..d39252c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,7 +3,7 @@ use chrono::Local;
 use clap::Parser;
 use crossbeam::channel::{self, Receiver, Sender};
 use env_logger::{fmt::Color, Builder};
-use log::{debug, error, info, trace, warn, LevelFilter};
+use log::{debug, error, info, trace, LevelFilter};
 use noodles::{
     fasta::{
         self,
@@ -12,14 +12,15 @@ use noodles::{
     fastq,
 };
 use std::{
-    collections::HashMap,
+    collections::{HashMap, HashSet},
     fs::{self},
     io::{self, prelude::*, BufReader},
     path::Path,
-    sync::Arc,
+    sync::{Arc, Mutex},
     thread,
     time::{Duration, Instant},
 };
+use lazy_static::lazy_static;
 
 mod cli;
 
@@ -42,52 +43,12 @@ impl Tree {
     }
 }
 
-/// Reads a FASTQ file from the specified path and returns a buffered reader.
-///
-/// This function reads a FASTQ file from the given path and returns a buffered reader.
-/// It automatically handles both plain text and gzipped files based on the file's magic number.
-///
-/// # Arguments
-///
-/// * `path` - A string containing the path to the FASTQ file.
-///
-/// # Returns
-///
-/// A buffered reader containing the contents of the FASTQ file. The reader may be a
-/// plain text reader or a gzip decompressor, depending on the file format.
-// fn read_fastq(path: &str) -> BufReader<Box<dyn io::Read>> {
-//     // The gzip magic number is 0x1f8b
-//     const GZIP_MAGIC_NUMBER: [u8; 2] = [0x1f, 0x8b];
-
-//     let mut file = match fs::File::open(path) {
-//         Ok(file) => file,
-//         Err(_) => {
-//             error!("Error opening fastq file");
-//             std::process::exit(1);
-//         }
-//     };
-
-//     // Buffer to store the first two bytes of the file
-//     let mut buffer = [0u8; 2];
-
-//     if let Ok(size) = file.read(&mut buffer) {
-//         // reset the pointer back to the start
-//         file.seek(io::SeekFrom::Start(0)).ok();
-//         // check if the first two bytes match the gzip magic number => file is gzipped
-//         // otherwise, it's a plain text file
-//         if size == 2 && buffer == GZIP_MAGIC_NUMBER {
-//             trace!("Detected gzipped file");
-//             let gzip_reader: Box<dyn io::Read> = Box::new(GzDecoder::new(file));
-//             io::BufReader::new(gzip_reader)
-//         } else {
-//             trace!("Detected plain text file");
-//             let plain_reader: Box<dyn io::Read> = Box::new(file);
-//             io::BufReader::new(plain_reader)
-//         }
-//     } else {
-//         panic!("Error reading from the file");
-//     }
-// }
+lazy_static! {
+    static ref TAXON_ID_COUNT: Arc<Mutex<usize>> = Arc::new(Mutex::new(0));
+    static ref TAXON_IDS: Arc<Mutex<Vec<i32>>> = Arc::new(Mutex::new(Vec::new()));
+    static ref TOTAL_READS: Arc<Mutex<usize>> = Arc::new(Mutex::new(0));
+    static ref READS_TO_EXTRACT: Arc<Mutex<usize>> = Arc::new(Mutex::new(0));
+}
 
 /// Parses a Kraken output line to extract taxon ID and read ID.
 ///
@@ -126,10 +87,9 @@ fn process_kraken_output(
     kraken_path: String,
     exclude: bool,
     taxon_ids_to_save: Vec<i32>,
-) -> Arc<HashMap<String, i32>> {
+) -> Arc<HashSet<String>> {
     info!("Processing kraken output...");
-    let mut reads_to_save = HashMap::new();
-    //let kraken_file = fs::File::open(kraken_path).expect("Error reading kraken output file");
+    let mut reads_to_save = HashSet::new();
     let kraken_file = match fs::File::open(kraken_path) {
        Ok(kraken_file) => kraken_file,
         Err(_) => {
@@ -140,26 +100,29 @@
     let reader = io::BufReader::new(kraken_file);
     let mut total_reads = 0;
 
-    io::stdout().flush().unwrap();
     for line_result in reader.lines() {
         let line = line_result.expect("Error reading kraken output line");
         let (taxon_id, read_id) = process_kraken_output_line(&line);
         if exclude {
             if !taxon_ids_to_save.contains(&taxon_id) {
-                reads_to_save.insert(read_id.clone(), taxon_id);
+                reads_to_save.insert(read_id);
             }
         } else if taxon_ids_to_save.contains(&taxon_id) {
-            reads_to_save.insert(read_id.clone(), taxon_id);
+            reads_to_save.insert(read_id);
         }
         total_reads += 1;
     }
-    info!("{} taxon IDs identified", taxon_ids_to_save.len());
-    info!(
-        "{} total reads | {} reads to extract.",
-        total_reads,
-        reads_to_save.len()
-    );
-    let reads_to_save: Arc<HashMap<String, i32>> = Arc::new(reads_to_save);
+    let mut taxon_id_count = TAXON_ID_COUNT.lock().unwrap();
+    let mut taxon_ids = TAXON_IDS.lock().unwrap();
+    let mut total_read_count = TOTAL_READS.lock().unwrap();
+    let mut reads_to_extract = READS_TO_EXTRACT.lock().unwrap();
+
+    *taxon_id_count = taxon_ids_to_save.len();
+    *taxon_ids = taxon_ids_to_save;
+    *total_read_count = total_reads; // total reads parsed from the kraken output
+    *reads_to_extract = reads_to_save.len();
+
+    let reads_to_save: Arc<HashSet<String>> = Arc::new(reads_to_save);
     reads_to_save
 }
@@ -277,7 +240,7 @@ fn build_tree_from_kraken_report(
     };
 
     {
-        let reader = io::BufReader::new(report_file);
+        let reader = BufReader::new(report_file);
         let mut prev_index = None;
 
         for line in reader.lines().flatten() {
@@ -364,54 +327,6 @@ fn collect_taxons_to_save(args: &Cli) -> Vec<i32> {
     taxon_ids_to_save
 }
 
-// fn parse_fastq(
-//     in_buf: io::BufReader<Box<dyn io::Read>>,
-//     tx: &Sender<Vec<u8>>,
-//     reads_to_save: Arc<HashMap<String, i32>>,
-//     output_fasta: bool,
-// ) {
-//     let mut num_lines = 0;
-//     let mut num_reads = 0;
-//     let mut current_id: String = String::new();
-//     //let mut stdout = BufWriter::new(io::stdout().lock());
-//     let mut line_bytes = Vec::new();
-//     let mut last_progress_update = Instant::now(); // For throttling progress updates
-//     const PROGRESS_UPDATE_INTERVAL: Duration = Duration::from_millis(1500);
-
-//     for line in in_buf.lines() {
-//         let line = line.expect("Error reading fastq line");
-//         line_bytes.clear();
-//         line_bytes.extend_from_slice(line.as_bytes());
-//         num_lines += 1;
-
-//         if num_lines % 4 == 1 {
-//             let fields: Vec<&str> = line.split_whitespace().collect();
-//             let read_id = &fields[0][1..];
-//             current_id = read_id.to_string();
-//             num_reads += 1;
-//         }
-//         if reads_to_save.contains_key(&current_id) {
-//             if output_fasta {
-//                 match num_lines % 4 {
-//                     1 => {
-//                         let fasta_header = format!("> {}", current_id);
-//                         tx.send(fasta_header.as_bytes().to_vec()).unwrap();
-//                     }
-//                     2 => tx.send(line_bytes.to_vec()).unwrap(),
-//                     _ => (),
-//                 }
-//             } else {
-//                 tx.send(line_bytes.to_vec()).unwrap();
-//             }
-//         }
-//         // Throttle progress updates - need to fix for multi-threading
-//         if last_progress_update.elapsed() >= PROGRESS_UPDATE_INTERVAL {
-//             trace!("Processed {} reads", num_reads);
-//             last_progress_update = Instant::now();
-//             //io::stdout().flush().unwrap();
-//         }
-//     }
-// }
 
 /// Parse a FASTQ file and send reads to writer thread.
 ///
@@ -424,11 +339,7 @@ fn collect_taxons_to_save(args: &Cli) -> Vec<i32> {
 /// * `file_path` - A string containing the path to the FASTQ file.
-/// * `reads_to_save` - A HashMap containing read IDs and their associated taxon IDs.
+/// * `reads_to_save` - A HashSet containing the IDs of the reads to extract.
 /// * `tx` - A Sender channel to send the parsed reads to the writer thread.
-fn parse_fastq(
-    file_path: &str,
-    reads_to_save: Arc<HashMap<String, i32>>,
-    tx: &Sender<fastq::Record>,
-) {
+fn parse_fastq(file_path: &str, reads_to_save: Arc<HashSet<String>>, tx: &Sender<fastq::Record>) {
     let mut num_reads = 0;
     let mut last_progress_update = Instant::now();
 
@@ -453,7 +364,7 @@ fn parse_fastq(
         let record = result.unwrap();
         let read_name_bytes = record.name();
         let read_name_string = std::str::from_utf8(read_name_bytes);
-        if reads_to_save.contains_key(&read_name_string.unwrap().to_string()) {
+        if reads_to_save.contains(&read_name_string.unwrap().to_string()) {
             tx.send(record).unwrap();
         }
         num_reads += 1;
@@ -466,31 +377,6 @@
     }
 }
 
-// fn write_output_file(
-//     out_file: fs::File,
-//     rx: Receiver<Vec<u8>>,
-//     compression_mode: Compression,
-//     no_compress: bool,
-//     output_fasta: bool,
-// ) {
-//     let mut out_buf: Box<dyn Write> = if no_compress || output_fasta {
-//         Box::new(io::BufWriter::new(out_file))
-//     } else {
-//         Box::new(io::BufWriter::new(GzEncoder::new(
-//             out_file,
-//             compression_mode,
-//         )))
-//     };
-
-//     for data in rx {
-//         out_buf
-//             .write_all(&data)
-//             .and_then(|_| out_buf.write_all(b"\n"))
-//             .expect("Error writing to output file");
-//     }
-//     out_buf.flush().expect("Error flushing output buffer");
-// }
-
 /// Infer the compression format from a file path based on its extension.
 ///
 /// This function takes a file path as input and checks its file extension to determine
@@ -510,8 +396,6 @@ fn infer_compression(file_path: &String) -> niffler::compression::Format {
     match ext {
         "gz" => niffler::compression::Format::Gzip,
         "bz2" => niffler::compression::Format::Bzip,
-        "lzma" => niffler::compression::Format::Lzma,
-        "zst" => niffler::compression::Format::Zstd,
         _ => niffler::compression::Format::No,
     }
 }
@@ -617,7 +501,7 @@ fn write_output_fasta(rx: Receiver<fastq::Record>, out_file: String) {
 /// * `compression_level` - The compression level to use for the output file.
 /// * `fasta` - A boolean indicating whether the output should be in FASTA format.
 fn process_single_end(
-    reads_to_save: Arc<HashMap<String, i32>>,
+    reads_to_save: Arc<HashSet<String>>,
     input: Vec<String>,
     output: Vec<String>,
     compression_type: Option<niffler::compression::Format>,
@@ -645,7 +529,7 @@ fn process_single_end(
         }
     });
     reader_thread.join().unwrap();
-    info!("Processing is done. Writing is in progress...");
+    info!("Processing complete. Writing is in progress...");
     writer_thread.join().unwrap();
     info!("Writing complete.");
 }
@@ -665,7 +549,7 @@
 /// * `compression_level` - The compression level to use for the output files.
 /// * `fasta` - A boolean indicating whether to output in FASTA format.
 fn process_paired_end(
-    reads_to_save: Arc<HashMap<String, i32>>,
+    reads_to_save: Arc<HashSet<String>>,
     input: Vec<String>,
     output: Vec<String>,
     compression_type: Option<niffler::compression::Format>,
@@ -719,7 +603,6 @@ fn process_paired_end(
     info!("Processing is done. Writing is in progress...");
     writer_thread1.join().unwrap();
     writer_thread2.join().unwrap();
-    info!("Writing complete.");
 }
 
 /// Initializes and configures the logger.
@@ -751,7 +634,7 @@ fn logger(verbose: bool) {
             writeln!(
                 buf,
                 "{} [{}] - {}",
-                Local::now().format("[%Y-%m-%d][%H:%M:%S]"),
+                Local::now().format("[%H:%M:%S]"),
                 style.value(record.level()),
                 record.args()
             )
@@ -796,4 +679,18 @@ fn main() {
             args.output_fasta,
         );
     }
+    info!("Complete!");
+
+    if !args.no_json {
+        let stats = serde_json::json!({
+            "taxon_id_count": *TAXON_ID_COUNT.lock().unwrap(),
+            "taxon_ids": *TAXON_IDS.lock().unwrap(),
+            "reads_in": *TOTAL_READS.lock().unwrap(),
+            "reads_out": *READS_TO_EXTRACT.lock().unwrap(),
+            "input_format": if paired { "paired-end" } else { "single-end" },
+            "output_format": if args.output_fasta { "fasta" } else { "fastq" },
+        });
+        let stats_json = serde_json::to_string_pretty(&stats).unwrap();
+        println!("{}", stats_json);
+    }
 }
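
Both `process_single_end` and `process_paired_end` hand records from a reader thread to a writer thread over a crossbeam channel, as the doc comments above describe. A minimal, self-contained sketch of that pattern (illustrative names and payload type, not the actual kractor code):

```rust
use crossbeam::channel;
use std::thread;

fn main() {
    // A bounded channel applies backpressure if the writer falls behind.
    let (tx, rx) = channel::bounded::<String>(1024);

    // Reader thread: stands in for parsing a FASTQ file and filtering
    // records against the set of read IDs to save.
    let reader = thread::spawn(move || {
        for i in 0..4 {
            tx.send(format!("@read_{i}")).unwrap();
        }
        // `tx` is dropped here, closing the channel and ending the writer loop.
    });

    // Writer thread: stands in for writing the (optionally compressed) output.
    let writer = thread::spawn(move || {
        for record in rx {
            println!("{record}");
        }
    });

    reader.join().unwrap();
    writer.join().unwrap();
}
```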