From e8d598d86cf22964ba6220d31b8e22c6544869e4 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Fri, 1 Mar 2024 12:49:57 -0800 Subject: [PATCH 01/15] Replace anonymous tuples in JSON parsing with structs. --- enclone_args/src/read_json.rs | 164 ++++++++++++++-------------------- 1 file changed, 65 insertions(+), 99 deletions(-) diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index b5a376765..6e47c3451 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -676,7 +676,17 @@ fn parse_vector_entry_from_json( // this code. Another would be to write out a binary version of the JSON file that contains // only the information that we need. -pub fn read_json( +#[derive(Default)] +struct ReadJsonResult { + vdj_cells: Vec, + gex_cells: Vec, + gex_cells_specified: bool, + cr_version: String, + tigs: Vec, + err: String, +} + +fn read_json( accept_inconsistent: bool, origin_info: &OriginInfo, li: usize, @@ -731,29 +741,13 @@ pub fn read_json( } } } - let mut results = Vec::<( - usize, - Vec, - Vec, - bool, - String, - Vec, - String, - )>::new(); + let mut results = Vec::<(usize, ReadJsonResult)>::new(); for i in 0..xs.len() { - results.push(( - i, - Vec::::new(), - Vec::::new(), - false, - String::new(), - Vec::::new(), - String::new(), - )); + results.push((i, Default::default())); } let exiting = AtomicBool::new(false); - results.par_iter_mut().for_each(|res| { - let i = res.0; + results.par_iter_mut().for_each(|(i, res)| { + let i = *i; let resx = parse_vector_entry_from_json( &xs[i], json, @@ -764,32 +758,32 @@ pub fn read_json( to_ref_index, reannotate, ctl, - &mut res.1, - &mut res.2, - &mut res.3, - &mut res.4, - &mut res.5, + &mut res.vdj_cells, + &mut res.gex_cells, + &mut res.gex_cells_specified, + &mut res.cr_version, + &mut res.tigs, &exiting, ); if let Err(resx) = resx { - res.6 = resx; + res.err = resx; } }); - for result in &results { - if !result.6.is_empty() { - return Err(result.6.clone()); + for (_, result) in &results { + if !result.err.is_empty() { + return Err(result.err.clone()); } } - for result in results.iter_mut().take(xs.len()) { - vdj_cells.append(&mut result.1); - gex_cells.append(&mut result.2); - if result.3 { + for (_, result) in results.iter_mut().take(xs.len()) { + vdj_cells.append(&mut result.vdj_cells); + gex_cells.append(&mut result.gex_cells); + if result.gex_cells_specified { *gex_cells_specified = true; } - if !result.4.is_empty() { - *cr_version = result.4.clone(); + if !result.cr_version.is_empty() { + *cr_version = result.cr_version.clone(); } - tigs.append(&mut result.5); + tigs.append(&mut result.tigs); } unique_sort(gex_cells); let mut tig_bc = Vec::>::new(); @@ -849,6 +843,15 @@ pub fn read_json( // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ // Parse the JSON annotations file(s). +#[derive(Default)] +struct LoadResult { + tig_bc: Vec>, + cr_version: String, + vdj_cells: Vec, + gex_cells: Vec, + gex_cells_specified: bool, + err: String, +} pub fn parse_json_annotations_files( ctl: &EncloneControl, @@ -860,30 +863,9 @@ pub fn parse_json_annotations_files( gex_cells_specified: &mut Vec, fate: &mut [HashMap], ) -> Result<(), String> { - // (origin index, contig name, V..J length): (?) - let mut results = Vec::<( - usize, - Vec<(String, usize)>, - Vec>, - Vec>, // logs - String, - Vec, - Vec, - bool, - String, - )>::new(); + let mut results = Vec::<(usize, LoadResult)>::new(); for i in 0..ctl.origin_info.dataset_path.len() { - results.push(( - i, - Vec::<(String, usize)>::new(), - Vec::>::new(), - Vec::>::new(), - String::new(), - Vec::::new(), - Vec::::new(), - false, - String::new(), - )); + results.push((i, Default::default())); } // Note: only tracking truncated seq and quals initially let ann = if !ctl.gen_opt.cellranger { @@ -891,12 +873,12 @@ pub fn parse_json_annotations_files( } else { "contig_annotations.json" }; - results.par_iter_mut().for_each(|res| { - let li = res.0; + results.par_iter_mut().for_each(|(li, res)| { + let li = *li; let json = format!("{}/{ann}", ctl.origin_info.dataset_path[li]); let json_lz4 = format!("{}/{ann}.lz4", ctl.origin_info.dataset_path[li]); if !path_exists(&json) && !path_exists(&json_lz4) { - res.8 = format!("\ncan't find {json} or {json_lz4}\n"); + res.err = format!("\ncan't find {json} or {json_lz4}\n"); return; } let resx = read_json( @@ -907,41 +889,37 @@ pub fn parse_json_annotations_files( refdata, to_ref_index, ctl.gen_opt.reannotate, - &mut res.4, + &mut res.cr_version, ctl, - &mut res.5, - &mut res.6, - &mut res.7, + &mut res.vdj_cells, + &mut res.gex_cells, + &mut res.gex_cells_specified, ); if let Ok(resx) = resx { let tig_bc: Vec> = resx; - res.5.sort(); - res.2 = tig_bc; + res.vdj_cells.sort(); + res.tig_bc = tig_bc; } else { - res.8 = resx.err().unwrap(); + res.err = resx.err().unwrap(); } }); - for result in &results { - if !result.8.is_empty() { - return Err(result.8.clone()); + for (_, result) in &results { + if !result.err.is_empty() { + return Err(result.err.clone()); } } let mut versions = Vec::::new(); - for i in 0..results.len() { - tig_bc.append(&mut results[i].2.clone()); - // ctl.gen_opt.cr_version = results[i].4.clone(); - if results[i].4.is_empty() { + for (i, mut result) in results { + tig_bc.append(&mut result.tig_bc); + if result.cr_version.is_empty() { versions.push("≤3.1".to_string()); } else { - versions.push(results[i].4.clone()); + versions.push(result.cr_version); } - vdj_cells.push(results[i].5.clone()); - gex_cells.push(results[i].6.clone()); - gex_cells_specified.push(results[i].7); - let cells = &results[i].5; + let cells = &result.vdj_cells; let mut found = vec![false; cells.len()]; - let tigs = &results[i].2; + let tigs = result.tig_bc; for tig in tigs { let p = bin_position(cells, &tig[0].barcode); if p >= 0 { @@ -953,22 +931,10 @@ pub fn parse_json_annotations_files( fate[i].insert(cells[j].clone(), BarcodeFate::NonProductive); } } + + vdj_cells.push(result.vdj_cells); + gex_cells.push(result.gex_cells); + gex_cells_specified.push(result.gex_cells_specified); } - /* - if !ctl.gen_opt.internal_run { - unique_sort(&mut versions); - if versions.len() > 1 - && versions != vec!["4.0".to_string(), "4009.52.0-82-g2244c685a".to_string()] - { - let args: Vec = env::args().collect(); - return Err(format!( - "\nYou're using output from multiple Cell Ranger versions = {},\n\ - which is not allowed. Your command was:\n{}\n", - versions.iter().format(", "), - args.iter().format(","), - )); - } - } - */ Ok(()) } From 57def9be8232834e6879c094bdace9465ffce129 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Fri, 1 Mar 2024 14:23:55 -0800 Subject: [PATCH 02/15] Invert data control flow in json reading. --- enclone_args/src/read_json.rs | 273 ++++++++++++++++------------------ 1 file changed, 127 insertions(+), 146 deletions(-) diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index 6e47c3451..eaf595c89 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -29,6 +29,7 @@ use debruijn::dna_string::DnaString; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{EncloneControl, OriginInfo, TigData}; use io_utils::{open_maybe_compressed, path_exists, read_vector_entry_from_json}; +use itertools::Itertools; use rand::Rng; use rayon::prelude::*; use serde_json::Value; @@ -93,6 +94,15 @@ fn json_error( // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ +#[derive(Default)] +struct JsonParseResult { + vdj_cell: Option, + gex_cell: Option, + gex_cells_specified: bool, + cr_version: String, + tig: Option, +} + fn parse_vector_entry_from_json( x: &[u8], json: &str, @@ -103,13 +113,9 @@ fn parse_vector_entry_from_json( to_ref_index: &HashMap, reannotate: bool, ctl: &EncloneControl, - vdj_cells: &mut Vec, - gex_cells: &mut Vec, - gex_cells_specified: &mut bool, - cr_version: &mut String, - tigs: &mut Vec, exiting: &AtomicBool, -) -> Result<(), String> { +) -> Result { + let mut res: JsonParseResult = Default::default(); let v: Value = match serde_json::from_slice(x) { Err(_) => { return Err(format!( @@ -132,29 +138,28 @@ fn parse_vector_entry_from_json( is_cell = true; } - let is_gex_cell = v["is_gex_cell"].as_bool(); - if is_gex_cell.is_some() { - *gex_cells_specified = true; - } - if is_gex_cell == Some(true) { - gex_cells.push(barcode.clone()); + if let Some(is_gex_cell) = v["is_gex_cell"].as_bool() { + res.gex_cells_specified = true; + if is_gex_cell { + res.gex_cell = Some(barcode.clone()); + } } if !ctl.gen_opt.ncell && !is_cell { - return Ok(()); + return Ok(res); } if is_cell { - vdj_cells.push(barcode.clone()); + res.vdj_cell = Some(barcode.clone()); } // Proceed. if !ctl.gen_opt.reprod && !v["productive"].as_bool().unwrap_or(false) { - return Ok(()); + return Ok(res); } if !ctl.gen_opt.reprod && !ctl.gen_opt.ncell && !v["high_confidence"].as_bool().unwrap_or(false) { - return Ok(()); + return Ok(res); } let tigname = v["contig_name"].to_string().between("\"", "\"").to_string(); let full_seq = &v["sequence"].to_string().between("\"", "\"").to_string(); @@ -175,8 +180,8 @@ fn parse_vector_entry_from_json( let mut cdr3_aa: String; let mut cdr3_dna: String; let mut cdr3_start: usize; - if v.get("version").is_some() { - *cr_version = v["version"].to_string().between("\"", "\"").to_string(); + if let Some(version) = v.get("version") { + res.cr_version = version.to_string().between("\"", "\"").to_string(); } // Read validated and non-validated UMIs. @@ -274,7 +279,7 @@ fn parse_vector_entry_from_json( ) { print!("{}", strme(&log)); println!("invalid"); - return Ok(()); + return Ok(res); } } else if !is_valid( &x, @@ -284,7 +289,7 @@ fn parse_vector_entry_from_json( &mut log, Some(ctl.gen_opt.gamma_delta), ) { - return Ok(()); + return Ok(res); } let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); get_cdr3_using_ann(&x, refdata, &ann, &mut cdr3); @@ -435,7 +440,7 @@ fn parse_vector_entry_from_json( } } if v_ref_id == 1000000 { - return Ok(()); + return Ok(res); } // Compute annv from cigarv. We don't compute the mismatch entry. @@ -495,14 +500,14 @@ fn parse_vector_entry_from_json( let x = DnaString::from_dna_string(full_seq); get_cdr3_using_ann(&x, refdata, &annv, &mut cdr3); if cdr3.is_empty() { - return Ok(()); + return Ok(res); } let cdr3_aa_alt = stringme(&cdr3[0].1); if cdr3_aa != cdr3_aa_alt { // This is particularly pathological and rare: if tig_start as usize > cdr3[0].0 { - return Ok(()); + return Ok(res); } // Define start. @@ -525,10 +530,10 @@ fn parse_vector_entry_from_json( // It is not known if these correspond to bugs in cellranger that were subsequently fixed. if cdr3_aa.contains('*') { - return Ok(()); + return Ok(res); } if cdr3_start + 3 * cdr3_aa.len() > tig_stop as usize - tig_start as usize { - return Ok(()); + return Ok(res); } // Keep going. @@ -606,7 +611,7 @@ fn parse_vector_entry_from_json( if invalidated_umis_present { invalu = Some(invalidated_umis); } - tigs.push(TigData { + res.tig = Some(TigData { cdr3_dna, len: seq.len(), v_start: tig_start, @@ -648,7 +653,7 @@ fn parse_vector_entry_from_json( invalidated_umis: invalu, frac_reads_used, }); - Ok(()) + Ok(res) } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ @@ -682,8 +687,7 @@ struct ReadJsonResult { gex_cells: Vec, gex_cells_specified: bool, cr_version: String, - tigs: Vec, - err: String, + tig_bc: Vec>, } fn read_json( @@ -694,13 +698,8 @@ fn read_json( refdata: &RefData, to_ref_index: &HashMap, reannotate: bool, - cr_version: &mut String, ctl: &EncloneControl, - vdj_cells: &mut Vec, - gex_cells: &mut Vec, - gex_cells_specified: &mut bool, -) -> Result>, String> { - *gex_cells_specified = false; +) -> Result { let mut tigs = Vec::::new(); let mut jsonx = json.clone(); if !path_exists(json) { @@ -741,51 +740,48 @@ fn read_json( } } } - let mut results = Vec::<(usize, ReadJsonResult)>::new(); - for i in 0..xs.len() { - results.push((i, Default::default())); - } let exiting = AtomicBool::new(false); - results.par_iter_mut().for_each(|(i, res)| { - let i = *i; - let resx = parse_vector_entry_from_json( - &xs[i], - json, - accept_inconsistent, - origin_info, - li, - refdata, - to_ref_index, - reannotate, - ctl, - &mut res.vdj_cells, - &mut res.gex_cells, - &mut res.gex_cells_specified, - &mut res.cr_version, - &mut res.tigs, - &exiting, - ); - if let Err(resx) = resx { - res.err = resx; - } - }); - for (_, result) in &results { - if !result.err.is_empty() { - return Err(result.err.clone()); + let results: Vec<_> = xs + .par_iter() + .map(|entry| { + parse_vector_entry_from_json( + entry, + json, + accept_inconsistent, + origin_info, + li, + refdata, + to_ref_index, + reannotate, + ctl, + &exiting, + ) + }) + .collect::, String>>()?; + + let mut tigs = Vec::new(); + let mut vdj_cells = Vec::new(); + let mut gex_cells = Vec::new(); + let mut gex_cells_specified = false; + let mut cr_version = String::new(); + for result in results { + if let Some(tig) = result.tig { + tigs.push(tig); + } + if let Some(c) = result.vdj_cell { + vdj_cells.push(c); + } + if let Some(c) = result.gex_cell { + gex_cells.push(c); } - } - for (_, result) in results.iter_mut().take(xs.len()) { - vdj_cells.append(&mut result.vdj_cells); - gex_cells.append(&mut result.gex_cells); if result.gex_cells_specified { - *gex_cells_specified = true; + gex_cells_specified = true; } if !result.cr_version.is_empty() { - *cr_version = result.cr_version.clone(); + cr_version = result.cr_version.clone(); } - tigs.append(&mut result.tigs); } - unique_sort(gex_cells); + unique_sort(&mut gex_cells); let mut tig_bc = Vec::>::new(); let mut r = 0; while r < tigs.len() { @@ -806,7 +802,7 @@ fn read_json( } r = s; } - unique_sort(vdj_cells); + unique_sort(&mut vdj_cells); // Subsample. @@ -820,106 +816,90 @@ fn read_json( if y < 1.0 - ctl.gen_opt.subsample { *del = true; let bc = &bc[0].barcode; - let p = bin_position(vdj_cells, bc); + let p = bin_position(&vdj_cells, bc); if p >= 0 { to_delete2[p as usize] = true; } - let p = bin_position(gex_cells, bc); + let p = bin_position(&gex_cells, bc); if p >= 0 { to_delete3[p as usize] = true; } } } erase_if(&mut tig_bc, &to_delete1); - erase_if(vdj_cells, &to_delete2); - erase_if(gex_cells, &to_delete3); + erase_if(&mut vdj_cells, &to_delete2); + erase_if(&mut gex_cells, &to_delete3); } // Done. - Ok(tig_bc) + Ok(ReadJsonResult { + vdj_cells, + gex_cells, + gex_cells_specified, + cr_version, + tig_bc, + }) } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -// Parse the JSON annotations file(s). -#[derive(Default)] -struct LoadResult { +pub struct Annotations { + vdj_cells: Vec>, + gex_cells: Vec>, + gex_cells_specified: Vec, tig_bc: Vec>, - cr_version: String, - vdj_cells: Vec, - gex_cells: Vec, - gex_cells_specified: bool, - err: String, + fate: Vec>, } pub fn parse_json_annotations_files( ctl: &EncloneControl, - tig_bc: &mut Vec>, refdata: &RefData, to_ref_index: &HashMap, - vdj_cells: &mut Vec>, - gex_cells: &mut Vec>, - gex_cells_specified: &mut Vec, - fate: &mut [HashMap], -) -> Result<(), String> { - let mut results = Vec::<(usize, LoadResult)>::new(); - for i in 0..ctl.origin_info.dataset_path.len() { - results.push((i, Default::default())); - } +) -> Result { // Note: only tracking truncated seq and quals initially let ann = if !ctl.gen_opt.cellranger { "all_contig_annotations.json" } else { "contig_annotations.json" }; - results.par_iter_mut().for_each(|(li, res)| { - let li = *li; - let json = format!("{}/{ann}", ctl.origin_info.dataset_path[li]); - let json_lz4 = format!("{}/{ann}.lz4", ctl.origin_info.dataset_path[li]); - if !path_exists(&json) && !path_exists(&json_lz4) { - res.err = format!("\ncan't find {json} or {json_lz4}\n"); - return; - } - let resx = read_json( - ctl.gen_opt.accept_inconsistent, - &ctl.origin_info, - li, - &json, - refdata, - to_ref_index, - ctl.gen_opt.reannotate, - &mut res.cr_version, - ctl, - &mut res.vdj_cells, - &mut res.gex_cells, - &mut res.gex_cells_specified, - ); - if let Ok(resx) = resx { - let tig_bc: Vec> = resx; - res.vdj_cells.sort(); - res.tig_bc = tig_bc; - } else { - res.err = resx.err().unwrap(); - } - }); - for (_, result) in &results { - if !result.err.is_empty() { - return Err(result.err.clone()); - } - } - let mut versions = Vec::::new(); - for (i, mut result) in results { - tig_bc.append(&mut result.tig_bc); - if result.cr_version.is_empty() { - versions.push("≤3.1".to_string()); - } else { - versions.push(result.cr_version); - } + let results = ctl + .origin_info + .dataset_path + .par_iter() + .enumerate() + .map(|(li, dataset_path)| { + let json = format!("{}/{ann}", dataset_path); + let json_lz4 = format!("{}/{ann}.lz4", dataset_path); + if !path_exists(&json) && !path_exists(&json_lz4) { + return Err(format!("\ncan't find {json} or {json_lz4}\n")); + } + read_json( + ctl.gen_opt.accept_inconsistent, + &ctl.origin_info, + li, + &json, + refdata, + to_ref_index, + ctl.gen_opt.reannotate, + ctl, + ) + .map(|r| (li, r)) + }) + .collect::, String>>()?; + + let mut ann = Annotations { + tig_bc: Default::default(), + vdj_cells: Default::default(), + gex_cells: Default::default(), + gex_cells_specified: Default::default(), + fate: vec![HashMap::::new(); ctl.origin_info.n()], + }; + for (i, result) in results { let cells = &result.vdj_cells; let mut found = vec![false; cells.len()]; - let tigs = result.tig_bc; + let tigs = &result.tig_bc; for tig in tigs { let p = bin_position(cells, &tig[0].barcode); if p >= 0 { @@ -928,13 +908,14 @@ pub fn parse_json_annotations_files( } for j in 0..found.len() { if !found[j] { - fate[i].insert(cells[j].clone(), BarcodeFate::NonProductive); + ann.fate[i].insert(cells[j].clone(), BarcodeFate::NonProductive); } } - vdj_cells.push(result.vdj_cells); - gex_cells.push(result.gex_cells); - gex_cells_specified.push(result.gex_cells_specified); + ann.tig_bc.extend(result.tig_bc.into_iter()); + ann.vdj_cells.push(result.vdj_cells); + ann.gex_cells.push(result.gex_cells); + ann.gex_cells_specified.push(result.gex_cells_specified); } - Ok(()) + Ok(ann) } From 8c21c0bc39f21de7358ea5b6c27cb5c1509a468a Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Fri, 1 Mar 2024 14:43:38 -0800 Subject: [PATCH 03/15] Finish cleaning up the JSON loading code. --- enclone_args/src/read_json.rs | 108 ++++++++++++++-------------------- enclone_stuff/src/start.rs | 25 +++----- 2 files changed, 54 insertions(+), 79 deletions(-) diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index eaf595c89..01c7c651d 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -29,12 +29,10 @@ use debruijn::dna_string::DnaString; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{EncloneControl, OriginInfo, TigData}; use io_utils::{open_maybe_compressed, path_exists, read_vector_entry_from_json}; -use itertools::Itertools; use rand::Rng; use rayon::prelude::*; use serde_json::Value; use std::fmt::Write; -use std::sync::atomic::{AtomicBool, Ordering}; use std::{collections::HashMap, io::BufReader}; use string_utils::{stringme, strme, TextUtils}; use vdj_ann::{annotate, refx, transcript}; @@ -42,54 +40,44 @@ use vector_utils::{bin_position, erase_if, unique_sort}; // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -fn json_error( - json: Option<&str>, - ctl: &EncloneControl, - exiting: &AtomicBool, - msg: &str, -) -> Result<(), String> { - // The following line prevents error messages from this function from being - // printed multiple times. - let mut msgx = String::new(); - if !exiting.swap(true, Ordering::Relaxed) { - msgx = "\nThere is something wrong with the contig annotations in the cellranger output \ - file" +fn json_error(json: Option<&str>, internal_run: bool, msg: &str) -> String { + let mut msgx = + "There is something wrong with the contig annotations in the cellranger output file" .to_string(); - if json.is_some() { - write!(msgx, "\n{}.", json.unwrap()).unwrap(); - } else { - msgx += "."; - } - if ctl.gen_opt.internal_run { - writeln!(msgx, "\n\npossibly relevant internal data: {msg}").unwrap(); - } - if ctl.gen_opt.internal_run { - msgx += "\n\nATTENTION INTERNAL 10X USERS!\n\ - Quite possibly you are using data from a cellranger run carried out using a \ - version\n\ - between 3.1 and 4.0. For certain of these versions, it is necessary to add the\n\ - argument CURRENT_REF to your command line. If that doesn't work, \ - please see below.\n"; - } - msgx += "\n\nHere is what you should do:\n\n\ - 1. If you used cellranger version ≥ 4.0, the problem is very likely\n\ - that the directory outs/vdj_reference was not retained, so enclone\n\ - didn't see it, and had to guess what the reference sequence was.\n\ - Fix this and everything should be fine.\n\n\ - 2. If you used cellranger version 3.1, then you need to add a command-line\n\ - argument REF=, or if you already did that,\n\ - make sure it is the *same* as that which you gave cellranger.\n\n\ - 3. If you used cellranger version < 3.1 (the only other possibility), then\n\ - you have options:\n\ - • rerun cellranger using the current version\n\ - • or provide an argument REF= as above and RE to force reannotation\n\ - • or provide the argument BUILT_IN to use the current reference and force\n \ - reannotation (and MOUSE if you used mouse); only works with human and mouse.\n\n\ - Note that one way to get the error is to specify TCR when you meant BCR, or the\n\ - other way.\n\n\ - If you're stuck, please write to us at enclone@10xgenomics.com.\n"; - } - Err(msgx) + if let Some(json) = json { + write!(msgx, "\n{}.", json).unwrap(); + } else { + msgx += "."; + } + if internal_run { + writeln!(msgx, "\n\npossibly relevant internal data: {msg}").unwrap(); + + msgx += "\n\nATTENTION INTERNAL 10X USERS!\n\ + Quite possibly you are using data from a cellranger run carried out using a \ + version\n\ + between 3.1 and 4.0. For certain of these versions, it is necessary to add the\n\ + argument CURRENT_REF to your command line. If that doesn't work, \ + please see below.\n"; + } + msgx += "\n\nHere is what you should do:\n\n\ + 1. If you used cellranger version ≥ 4.0, the problem is very likely\n\ + that the directory outs/vdj_reference was not retained, so enclone\n\ + didn't see it, and had to guess what the reference sequence was.\n\ + Fix this and everything should be fine.\n\n\ + 2. If you used cellranger version 3.1, then you need to add a command-line\n\ + argument REF=, or if you already did that,\n\ + make sure it is the *same* as that which you gave cellranger.\n\n\ + 3. If you used cellranger version < 3.1 (the only other possibility), then\n\ + you have options:\n\ + • rerun cellranger using the current version\n\ + • or provide an argument REF= as above and RE to force reannotation\n\ + • or provide the argument BUILT_IN to use the current reference and force\n \ + reannotation (and MOUSE if you used mouse); only works with human and mouse.\n\n\ + Note that one way to get the error is to specify TCR when you meant BCR, or the\n\ + other way.\n\n\ + If you're stuck, please write to us at enclone@10xgenomics.com.\n"; + + msgx } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ @@ -113,7 +101,6 @@ fn parse_vector_entry_from_json( to_ref_index: &HashMap, reannotate: bool, ctl: &EncloneControl, - exiting: &AtomicBool, ) -> Result { let mut res: JsonParseResult = Default::default(); let v: Value = match serde_json::from_slice(x) { @@ -374,10 +361,7 @@ fn parse_vector_entry_from_json( .to_string() .between("\"", "\"") .to_string(); - if refdata.name[feature_idx] != gene_name - && !accept_inconsistent - && !exiting.swap(true, Ordering::Relaxed) - { + if refdata.name[feature_idx] != gene_name && !accept_inconsistent { return Err(format!( "\nThere is an inconsistency between the reference \ file used to create the Cell Ranger output files in\n{}\nand the \ @@ -488,7 +472,7 @@ fn parse_vector_entry_from_json( let rt = &refdata.refs[v_ref_id]; if annv.len() == 2 && annv[0].1 as usize > rt.len() { let msg = format!("annv[0].1 = {}, rt.len() = {}", annv[0].1, rt.len()); - json_error(None, ctl, exiting, &msg)?; + return Err(json_error(None, ctl.gen_opt.internal_run, &msg)); } // Check to see if the CDR3 sequence has changed. This could happen if the cellranger @@ -540,7 +524,7 @@ fn parse_vector_entry_from_json( if tig_start < 0 || tig_stop < 0 { let msg = format!("tig_start = {tig_start}, tig_stop = {tig_stop}"); - json_error(Some(json), ctl, exiting, &msg)?; + return Err(json_error(Some(json), ctl.gen_opt.internal_run, &msg)); } let (tig_start, tig_stop) = (tig_start as usize, tig_stop as usize); let quals0 = v["quals"].to_string(); @@ -740,7 +724,6 @@ fn read_json( } } } - let exiting = AtomicBool::new(false); let results: Vec<_> = xs .par_iter() .map(|entry| { @@ -754,7 +737,6 @@ fn read_json( to_ref_index, reannotate, ctl, - &exiting, ) }) .collect::, String>>()?; @@ -845,11 +827,11 @@ fn read_json( // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ pub struct Annotations { - vdj_cells: Vec>, - gex_cells: Vec>, - gex_cells_specified: Vec, - tig_bc: Vec>, - fate: Vec>, + pub vdj_cells: Vec>, + pub gex_cells: Vec>, + pub gex_cells_specified: Vec, + pub tig_bc: Vec>, + pub fate: Vec>, } pub fn parse_json_annotations_files( diff --git a/enclone_stuff/src/start.rs b/enclone_stuff/src/start.rs index b537b6c41..62c4784c0 100644 --- a/enclone_stuff/src/start.rs +++ b/enclone_stuff/src/start.rs @@ -18,7 +18,7 @@ use enclone::join::join_exacts; use enclone::misc1::{cross_filter, lookup_heavy_chain_reuse}; use enclone::misc2::{check_for_barcode_reuse, find_exact_subclonotypes, search_for_shm_indels}; use enclone::misc3::sort_tig_bc; -use enclone_args::read_json::parse_json_annotations_files; +use enclone_args::read_json::{parse_json_annotations_files, Annotations}; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{AlleleData, CloneInfo, TigData}; use enclone_core::enclone_structs::{EncloneExacts, EncloneIntermediates, EncloneSetup}; @@ -121,21 +121,14 @@ pub fn main_enclone_start(setup: EncloneSetup) -> Result>::new(); - let mut vdj_cells = Vec::>::new(); - let mut gex_cells = Vec::>::new(); - let mut gex_cells_specified = Vec::::new(); - let mut fate = vec![HashMap::::new(); ctl.origin_info.n()]; - parse_json_annotations_files( - ctl, - &mut tig_bc, - refdata, - to_ref_index, - &mut vdj_cells, - &mut gex_cells, - &mut gex_cells_specified, - &mut fate, - )?; + + let Annotations { + mut tig_bc, + gex_cells, + gex_cells_specified, + vdj_cells, + mut fate, + } = parse_json_annotations_files(ctl, refdata, to_ref_index)?; ctl.perf_stats(&tparse, "loading from json"); // Populate features. From 082a674c31201aa4a57788101830027cb530d6d1 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Fri, 1 Mar 2024 14:46:38 -0800 Subject: [PATCH 04/15] unused var --- enclone_args/src/read_json.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index 01c7c651d..dea22b7bb 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -684,7 +684,6 @@ fn read_json( reannotate: bool, ctl: &EncloneControl, ) -> Result { - let mut tigs = Vec::::new(); let mut jsonx = json.clone(); if !path_exists(json) { jsonx = format!("{json}.lz4"); From ddad6262077e3d32decd4355a57707bb64b3d207 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Fri, 1 Mar 2024 16:45:53 -0800 Subject: [PATCH 05/15] Finish refactoring JSON parsing to use the type we already have... --- Cargo.lock | 1 + enclone_args/Cargo.toml | 1 + enclone_args/src/read_json.rs | 382 ++++++++++++---------------------- 3 files changed, 137 insertions(+), 247 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cdf177945..0aaf9b057 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -468,6 +468,7 @@ dependencies = [ "serde_json", "string_utils", "vdj_ann", + "vdj_types", "vector_utils", ] diff --git a/enclone_args/Cargo.toml b/enclone_args/Cargo.toml index cc16c5d5c..92a392fab 100644 --- a/enclone_args/Cargo.toml +++ b/enclone_args/Cargo.toml @@ -36,6 +36,7 @@ regex = { version = "1", default-features = false, features = ["std", "perf"] } serde_json = "1" string_utils = { path = "../string_utils" } vdj_ann = { path = "../vdj_ann" } +vdj_types = { path = "../vdj_types" } vector_utils = { path = "../vector_utils" } [target.'cfg(not(windows))'.dependencies.hdf5] diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index dea22b7bb..745ce2f27 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -1,41 +1,20 @@ // Copyright (c) 2021 10X Genomics, Inc. All rights reserved. -// Fields that are used in all_contig_annotations.json: -// • barcode -// • is_cell and is_asm_cell -- both are optional, but at least one needs to be present and -// true for a cell called by the VDJ pipeline -// • is_gex_cell -- optional -// • productive -- optional but should be true for contigs to be used -// • high_confidence -- optional but should be true for contigs to be used -// • contig_name -// • sequence -// • version -- optional -// • validated_umis -- optional -// • non_validated_umis -- optional -// • invalidated_umis -- optional -// • fraction_of_reads_for_this_barcode_provided_as_input_to_assembly -- optional -// • quals -// • umi_count -// • read_count -// • cdr3, unless in reannotate mode -// • cdr3_seq, unless in reannotate mode -// • cdr3_start, unless in reannotate mode -// • annotations, unless in reannotate mode. - use self::annotate::{annotate_seq, get_cdr3_using_ann, print_some_annotations}; use self::refx::RefData; use self::transcript::is_valid; use debruijn::dna_string::DnaString; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{EncloneControl, OriginInfo, TigData}; -use io_utils::{open_maybe_compressed, path_exists, read_vector_entry_from_json}; +use io_utils::{open_maybe_compressed, path_exists}; use rand::Rng; use rayon::prelude::*; -use serde_json::Value; +use std::collections::HashMap; use std::fmt::Write; -use std::{collections::HashMap, io::BufReader}; use string_utils::{stringme, strme, TextUtils}; +use vdj_ann::annotate::ContigAnnotation; use vdj_ann::{annotate, refx, transcript}; +use vdj_types::{VdjChain, VdjRegion}; use vector_utils::{bin_position, erase_if, unique_sort}; // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ @@ -87,48 +66,35 @@ struct JsonParseResult { vdj_cell: Option, gex_cell: Option, gex_cells_specified: bool, - cr_version: String, tig: Option, } fn parse_vector_entry_from_json( - x: &[u8], + ann: ContigAnnotation, json: &str, accept_inconsistent: bool, origin_info: &OriginInfo, - li: usize, + dataset_index: usize, refdata: &RefData, to_ref_index: &HashMap, reannotate: bool, ctl: &EncloneControl, ) -> Result { let mut res: JsonParseResult = Default::default(); - let v: Value = match serde_json::from_slice(x) { - Err(_) => { - return Err(format!( - "\nInternal error, failed to parse a value from a string. The string is:\n{}\n", - strme(x) - )); - } - Ok(v) => v, - }; - let barcode = v["barcode"].to_string().between("\"", "\"").to_string(); // Get cell status. Sometime after CR 4.0 was released, and before 4.1 was released, // we added new fields is_asm_cell and is_gex_cell to the json file. The value of // is_asm_cell is the original determination of "cell" in the VDJ pipeline, whereas the // value of is_gex_cell is that for the GEX pipeline. - - let mut is_cell = v["is_cell"].as_bool().unwrap_or(false); - let is_asm_cell = v["is_asm_cell"].as_bool().unwrap_or(false); - if is_asm_cell { + let mut is_cell = ann.is_cell; + if ann.is_asm_cell.is_some_and(|is_asm_cell| is_asm_cell) { is_cell = true; } - if let Some(is_gex_cell) = v["is_gex_cell"].as_bool() { + if let Some(is_gex_cell) = ann.is_gex_cell { res.gex_cells_specified = true; if is_gex_cell { - res.gex_cell = Some(barcode.clone()); + res.gex_cell = Some(ann.barcode.clone()); } } @@ -136,20 +102,17 @@ fn parse_vector_entry_from_json( return Ok(res); } if is_cell { - res.vdj_cell = Some(barcode.clone()); + res.vdj_cell = Some(ann.barcode.clone()); } // Proceed. - if !ctl.gen_opt.reprod && !v["productive"].as_bool().unwrap_or(false) { + if !ctl.gen_opt.reprod && !ann.productive.unwrap_or(false) { return Ok(res); } - if !ctl.gen_opt.reprod && !ctl.gen_opt.ncell && !v["high_confidence"].as_bool().unwrap_or(false) - { + if !ctl.gen_opt.reprod && !ctl.gen_opt.ncell && !ann.high_confidence { return Ok(res); } - let tigname = v["contig_name"].to_string().between("\"", "\"").to_string(); - let full_seq = &v["sequence"].to_string().between("\"", "\"").to_string(); let mut left = false; let (mut v_ref_id, mut j_ref_id) = (1000000, 0); let mut d_ref_id: Option = None; @@ -167,99 +130,62 @@ fn parse_vector_entry_from_json( let mut cdr3_aa: String; let mut cdr3_dna: String; let mut cdr3_start: usize; - if let Some(version) = v.get("version") { - res.cr_version = version.to_string().between("\"", "\"").to_string(); - } - // Read validated and non-validated UMIs. - - let mut validated_umis = Vec::::new(); - let mut validated_umis_present = false; - let val = v["validated_umis"].as_array(); - if let Some(val) = val { - validated_umis_present = true; - for vi in val { - validated_umis.push(vi.to_string().between("\"", "\"").to_string()); - } - } - let mut non_validated_umis = Vec::::new(); - let mut non_validated_umis_present = false; - let non_val = v["non_validated_umis"].as_array(); - if let Some(non_val) = non_val { - non_validated_umis_present = true; - for nv in non_val { - non_validated_umis.push(nv.to_string().between("\"", "\"").to_string()); - } - } - let mut invalidated_umis = Vec::::new(); - let mut invalidated_umis_present = false; - let inval = v["invalidated_umis"].as_array(); - if let Some(inval) = inval { - invalidated_umis_present = true; - for inv in inval { - invalidated_umis.push(inv.to_string().between("\"", "\"").to_string()); - } - } - - // Read fraction_of_reads_for_this_barcode_provided_as_input_to_assembly. - - let mut frac_reads_used = None; - let f = v["fraction_of_reads_for_this_barcode_provided_as_input_to_assembly"].as_f64(); - if let Some(f) = f { - frac_reads_used = Some((f * 1_000_000.0).round() as u32); - } + let frac_reads_used = ann + .fraction_of_reads_for_this_barcode_provided_as_input_to_assembly + .map(|f| (f * 1_000_000.0).round() as u32); // Reannotate. - if reannotate || ctl.gen_opt.reprod { - let x = DnaString::from_dna_string(full_seq); - let mut ann = Vec::<(i32, i32, i32, i32, i32)>::new(); - annotate_seq(&x, refdata, &mut ann, true, false, true); + let x = DnaString::from_dna_string(&ann.sequence); + let mut ann1 = Vec::<(i32, i32, i32, i32, i32)>::new(); + annotate_seq(&x, refdata, &mut ann1, true, false, true); // If there are multiple V segment alignments, possibly reduce to just one. let mut ann2 = Vec::<(i32, i32, i32, i32, i32)>::new(); let mut j = 0; - while j < ann.len() { - let t = ann[j].2 as usize; + while j < ann1.len() { + let t = ann1[j].2 as usize; let mut k = j + 1; - while k < ann.len() { - if refdata.segtype[ann[k].2 as usize] != refdata.segtype[t] { + while k < ann1.len() { + if refdata.segtype[ann1[k].2 as usize] != refdata.segtype[t] { break; } k += 1; } if refdata.segtype[t] == "V" && k - j > 1 { let mut entries = 1; - if j < ann.len() - 1 - && ann[j + 1].2 as usize == t - && ((ann[j].0 + ann[j].1 == ann[j + 1].0 && ann[j].3 + ann[j].1 < ann[j + 1].3) - || (ann[j].0 + ann[j].1 < ann[j + 1].0 - && ann[j].3 + ann[j].1 == ann[j + 1].3)) + if j < ann1.len() - 1 + && ann1[j + 1].2 as usize == t + && ((ann1[j].0 + ann1[j].1 == ann1[j + 1].0 + && ann1[j].3 + ann1[j].1 < ann1[j + 1].3) + || (ann1[j].0 + ann1[j].1 < ann1[j + 1].0 + && ann1[j].3 + ann1[j].1 == ann1[j + 1].3)) { entries = 2; } - ann2.extend(&ann[j..j + entries]); + ann2.extend(&ann1[j..j + entries]); } else { - ann2.extend(&ann[j..k]); + ann2.extend(&ann1[j..k]); } j = k; } - ann = ann2; + ann1 = ann2; // Proceed. - if ctl.gen_opt.trace_barcode == *barcode { + if ctl.gen_opt.trace_barcode == ann.barcode { let mut log = Vec::::new(); - print_some_annotations(refdata, &ann, &mut log, false); + print_some_annotations(refdata, &ann1, &mut log, false); print!("\n{}", strme(&log)); } let mut log = Vec::::new(); - if ctl.gen_opt.trace_barcode == *barcode { + if ctl.gen_opt.trace_barcode == ann.barcode { if !is_valid( &x, refdata, - &ann, + &ann1, true, &mut log, Some(ctl.gen_opt.gamma_delta), @@ -271,7 +197,7 @@ fn parse_vector_entry_from_json( } else if !is_valid( &x, refdata, - &ann, + &ann1, false, &mut log, Some(ctl.gen_opt.gamma_delta), @@ -279,14 +205,14 @@ fn parse_vector_entry_from_json( return Ok(res); } let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - get_cdr3_using_ann(&x, refdata, &ann, &mut cdr3); + get_cdr3_using_ann(&x, refdata, &ann1, &mut cdr3); cdr3_aa = stringme(&cdr3[0].1); cdr3_start = cdr3[0].0; cdr3_dna = x .slice(cdr3_start, cdr3_start + 3 * cdr3_aa.len()) .to_string(); let mut seen_j = false; - for anni in ann { + for anni in ann1 { let t = anni.2 as usize; if refdata.is_u(t) { u_ref_id = Some(t); @@ -305,7 +231,7 @@ fn parse_vector_entry_from_json( if tig_start > cdr3_start as isize { panic!( "Something is wrong with the CDR3 start for this contig:\n\n{}.", - &full_seq + ann.sequence ); } cdr3_start -= tig_start as usize; @@ -332,35 +258,31 @@ fn parse_vector_entry_from_json( } else { // Use annotations from json file. - cdr3_aa = v["cdr3"].to_string().between("\"", "\"").to_string(); - cdr3_dna = v["cdr3_seq"].to_string().between("\"", "\"").to_string(); - cdr3_start = v["cdr3_start"].as_u64().unwrap() as usize; - let ann = v["annotations"].as_array(); - if ann.is_none() { + cdr3_aa = ann.cdr3.unwrap(); + cdr3_dna = ann.cdr3_seq.unwrap(); + cdr3_start = ann.cdr3_start.unwrap(); + let annotations = ann.annotations; + if annotations.is_empty() { return Err(format!( "\nThe file\n{json}\ndoes not contain annotations. To use enclone with it, \ please specify the argument BUILT_IN\nto force use of the internal \ reference and recompute annotations.\n" )); } - let ann = ann.unwrap(); let mut cigarv = String::new(); // cigar for V segment - for a in ann { - let region_type = &a["feature"]["region_type"]; - let feature_id = a["feature"]["feature_id"].as_u64().unwrap() as usize; + for a in annotations { + let region_type = a.feature.region_type; + let feature_id = a.feature.feature_id; if !to_ref_index.contains_key(&feature_id) { continue; } let feature_idx = to_ref_index[&feature_id]; - let ref_start = a["annotation_match_start"].as_u64().unwrap() as usize; - if region_type == "L-REGION+V-REGION" { - v_stop = a["contig_match_end"].as_i64().unwrap() as usize; - v_stop_ref = a["annotation_match_end"].as_i64().unwrap() as usize; + let ref_start = a.annotation_match_start; + if region_type == VdjRegion::V { + v_stop = a.contig_match_end; + v_stop_ref = a.annotation_match_end; } - let gene_name = a["feature"]["gene_name"] - .to_string() - .between("\"", "\"") - .to_string(); + let gene_name = a.feature.gene_name; if refdata.name[feature_idx] != gene_name && !accept_inconsistent { return Err(format!( "\nThere is an inconsistency between the reference \ @@ -383,43 +305,39 @@ fn parse_vector_entry_from_json( refdata.name[feature_idx] )); } - if region_type == "L-REGION+V-REGION" && ref_start == 0 { - let chain = a["feature"]["chain"] - .to_string() - .between("\"", "\"") - .to_string(); - // if !chain.starts_with("IG") { continue; } // ******************* - tig_start = a["contig_match_start"].as_i64().unwrap() as isize; + if region_type == VdjRegion::V && ref_start == 0 { + let chain = a.feature.chain; + + tig_start = a.contig_match_start as isize; cdr3_start -= tig_start as usize; - chain_type = chain.clone(); - if chain == *"IGH" - || chain == *"TRB" - || (chain == *"TRD" && ctl.gen_opt.gamma_delta) + if chain == VdjChain::IGH + || chain == VdjChain::TRB + || (chain == VdjChain::TRD && ctl.gen_opt.gamma_delta) { left = true; } v_ref_id = feature_idx; - cigarv = a["cigar"].to_string().between("\"", "\"").to_string(); + cigarv = a.cigar; } else { // also check for IG chain????????????????????????????????????????? - let ref_stop = a["annotation_match_end"].as_u64().unwrap() as usize; - let ref_len = a["annotation_length"].as_u64().unwrap() as usize; - if region_type == "J-REGION" && ref_stop == ref_len { - tig_stop = a["contig_match_end"].as_i64().unwrap() as isize; + let ref_stop = a.annotation_match_end; + let ref_len = a.annotation_length; + if region_type == VdjRegion::J && ref_stop == ref_len { + tig_stop = a.contig_match_end as isize; j_ref_id = feature_idx; - j_start = a["contig_match_start"].as_i64().unwrap() as usize; - j_start_ref = a["annotation_match_start"].as_i64().unwrap() as usize; + j_start = a.contig_match_start; + j_start_ref = a.annotation_match_start; } - if region_type == "5'UTR" { + if region_type == VdjRegion::UTR { u_ref_id = Some(feature_idx); } - if region_type == "D-REGION" { - d_start = Some(a["contig_match_start"].as_i64().unwrap() as usize); + if region_type == VdjRegion::D { + d_start = Some(a.contig_match_start); d_ref_id = Some(feature_idx); } - if region_type == "C-REGION" { + if region_type == VdjRegion::C { c_ref_id = Some(feature_idx); - c_start = Some(a["contig_match_start"].as_i64().unwrap() as usize); + c_start = Some(a.contig_match_start); } } } @@ -481,7 +399,7 @@ fn parse_vector_entry_from_json( // inconsistencies, leading to an assert somewhere downstream. let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - let x = DnaString::from_dna_string(full_seq); + let x = DnaString::from_dna_string(&ann.sequence); get_cdr3_using_ann(&x, refdata, &annv, &mut cdr3); if cdr3.is_empty() { return Ok(res); @@ -527,11 +445,9 @@ fn parse_vector_entry_from_json( return Err(json_error(Some(json), ctl.gen_opt.internal_run, &msg)); } let (tig_start, tig_stop) = (tig_start as usize, tig_stop as usize); - let quals0 = v["quals"].to_string(); - let quals0 = quals0.after("\"").as_bytes(); let mut quals = Vec::::new(); let mut slashed = false; - for &qual in quals0.iter().take(quals0.len() - 1) { + for &qual in ann.quals.as_bytes() { if !slashed && qual == b'\\' /* && ( i == 0 || quals0[i-1] != b'\\' ) */ { @@ -541,36 +457,42 @@ fn parse_vector_entry_from_json( slashed = false; quals.push(qual); } - assert_eq!(full_seq.len(), quals.len()); - let seq = &full_seq[tig_start..tig_stop].to_string(); + assert_eq!(ann.sequence.len(), quals.len()); + let seq = &ann.sequence[tig_start..tig_stop].to_string(); for qual in quals.iter_mut() { *qual -= 33_u8; } let full_quals = quals; let quals = full_quals[tig_start..tig_stop].to_vec(); - let umi_count = v["umi_count"].as_i64().unwrap() as usize; - let read_count = v["read_count"].as_i64().unwrap() as usize; - let origin = origin_info.origin_for_bc[li].get(&barcode).or_else(|| { - // the way we use s1 here is flaky - if !origin_info.origin_id[li].is_empty() - && (origin_info.origin_id[li] != *"s1" || origin_info.origin_for_bc[li].is_empty()) - { - Some(&origin_info.origin_id[li]) - } else { - None - } - }); - let donor = origin_info.donor_for_bc[li].get(&barcode).or_else(|| { - // the way we use d1 here is flaky - if !origin_info.origin_id[li].is_empty() - && (origin_info.donor_id[li] != *"d1" || origin_info.donor_for_bc[li].is_empty()) - { - Some(&origin_info.donor_id[li]) - } else { - None - } - }); - let tag = origin_info.tag[li].get(&barcode); + let umi_count = ann.umi_count; + let read_count = ann.read_count; + let origin = origin_info.origin_for_bc[dataset_index] + .get(&ann.barcode) + .or_else(|| { + // the way we use s1 here is flaky + if !origin_info.origin_id[dataset_index].is_empty() + && (origin_info.origin_id[dataset_index] != *"s1" + || origin_info.origin_for_bc[dataset_index].is_empty()) + { + Some(&origin_info.origin_id[dataset_index]) + } else { + None + } + }); + let donor = origin_info.donor_for_bc[dataset_index] + .get(&ann.barcode) + .or_else(|| { + // the way we use d1 here is flaky + if !origin_info.origin_id[dataset_index].is_empty() + && (origin_info.donor_id[dataset_index] != *"d1" + || origin_info.donor_for_bc[dataset_index].is_empty()) + { + Some(&origin_info.donor_id[dataset_index]) + } else { + None + } + }); + let tag = origin_info.tag[dataset_index].get(&ann.barcode); let mut origin_index = None; let mut donor_index = None; let mut tag_index = None; @@ -583,18 +505,7 @@ fn parse_vector_entry_from_json( if let Some(tag) = tag { tag_index = Some(bin_position(&origin_info.tag_list, tag) as usize); } - let mut valu = None; - if validated_umis_present { - valu = Some(validated_umis); - } - let mut non_valu = None; - if non_validated_umis_present { - non_valu = Some(non_validated_umis); - } - let mut invalu = None; - if invalidated_umis_present { - invalu = Some(invalidated_umis); - } + res.tig = Some(TigData { cdr3_dna, len: seq.len(), @@ -606,7 +517,7 @@ fn parse_vector_entry_from_json( j_start_ref, j_stop: tig_stop, c_start, - full_seq: full_seq.as_bytes().to_vec(), + full_seq: ann.sequence.as_bytes().to_vec(), v_ref_id, d_ref_id, j_ref_id, @@ -621,10 +532,10 @@ fn parse_vector_entry_from_json( cdr3_start, quals, full_quals, - barcode, - tigname, + barcode: ann.barcode, + tigname: ann.contig_name, left, - dataset_index: li, + dataset_index, origin_index, donor_index, tag_index, @@ -632,9 +543,9 @@ fn parse_vector_entry_from_json( read_count, chain_type, annv, - validated_umis: valu, - non_validated_umis: non_valu, - invalidated_umis: invalu, + validated_umis: ann.validated_umis, + non_validated_umis: ann.non_validated_umis, + invalidated_umis: ann.invalidated_umis, frac_reads_used, }); Ok(res) @@ -644,40 +555,23 @@ fn parse_vector_entry_from_json( // Parse the JSON annotations file. // -// In the future could be converted to LazyWrite: -// https://martian-lang.github.io/martian-rust/doc/martian_filetypes/json_file/ -// index.html#lazy-readwrite-example. -// // Tracking contigs using bc_cdr3_aa; could improve later. // // This section requires 3.1. If you want to avoid that, do something to make tig_start // and tig_stop always nonnegative. Or use the RE option. -// -// Computational performance. It would appear that nearly all the time here is spent in -// two lines: -// -// read_vector_entry_from_json(&mut f) { -// let v: Value = serde_json::from_str(strme(&x)).unwrap(); -// (Should retest.) -// -// and simply reading the file lines is several times faster. So the way we parse the -// files is suboptimal. If we want to make this faster, one option would be to speed up -// this code. Another would be to write out a binary version of the JSON file that contains -// only the information that we need. #[derive(Default)] struct ReadJsonResult { vdj_cells: Vec, gex_cells: Vec, gex_cells_specified: bool, - cr_version: String, tig_bc: Vec>, } fn read_json( accept_inconsistent: bool, origin_info: &OriginInfo, - li: usize, + dataset_index: usize, json: &String, refdata: &RefData, to_ref_index: &HashMap, @@ -707,45 +601,43 @@ fn read_json( input files to enclone, including the PRE argument.\n" )); } - let mut f = BufReader::new(open_maybe_compressed(&jsonx)); - // ◼ This loop could be speeded up, see comments above. - let mut xs = Vec::>::new(); - loop { - let x = read_vector_entry_from_json(&mut f); - if x.is_err() { - eprintln!("\nProblem reading {jsonx}.\n"); - return Err(x.err().unwrap()); - } - match x.unwrap() { - None => break, - Some(x) => { - xs.push(x); - } - } - } - let results: Vec<_> = xs - .par_iter() - .map(|entry| { + // Read the entire file to memory before parsing. + // See https://github.com/serde-rs/json/issues/160 + // The previous implementation was essentially doing this anyway, so it + // shouldn't drastically change our memory consumption. + let mut contents = String::new(); + open_maybe_compressed(&jsonx) + .read_to_string(&mut contents) + .unwrap(); + + let mut results: Vec<_> = serde_json::Deserializer::from_str(&contents) + .into_iter::() + .enumerate() + .par_bridge() + .map(|(ann_index, entry)| { parse_vector_entry_from_json( - entry, + entry.unwrap(), json, accept_inconsistent, origin_info, - li, + dataset_index, refdata, to_ref_index, reannotate, ctl, ) + .map(|r| (ann_index, r)) }) .collect::, String>>()?; + // rayon's par_bridge feature doesn't preserve order, so sort the result + // for stable behavior. + results.sort_by_key(|(ann_index, _)| *ann_index); let mut tigs = Vec::new(); let mut vdj_cells = Vec::new(); let mut gex_cells = Vec::new(); let mut gex_cells_specified = false; - let mut cr_version = String::new(); - for result in results { + for (_, result) in results { if let Some(tig) = result.tig { tigs.push(tig); } @@ -758,9 +650,6 @@ fn read_json( if result.gex_cells_specified { gex_cells_specified = true; } - if !result.cr_version.is_empty() { - cr_version = result.cr_version.clone(); - } } unique_sort(&mut gex_cells); let mut tig_bc = Vec::>::new(); @@ -818,7 +707,6 @@ fn read_json( vdj_cells, gex_cells, gex_cells_specified, - cr_version, tig_bc, }) } From c9ea09e9f3d0910c43080993bd057597c3f5f4d2 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Fri, 1 Mar 2024 16:48:35 -0800 Subject: [PATCH 06/15] Delete the manually-implemented JSON loading function. --- io_utils/src/lib.rs | 79 --------------------------------------------- 1 file changed, 79 deletions(-) diff --git a/io_utils/src/lib.rs b/io_utils/src/lib.rs index f15a6b669..3b393a10a 100644 --- a/io_utils/src/lib.rs +++ b/io_utils/src/lib.rs @@ -251,85 +251,6 @@ pub fn get_metric_value(f: impl AsRef, metric: &str) -> String { String::default() } -// ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -// CODE FOR STREAMING A JSON VECTOR -// ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ - -// Read an entry from a json file that represents a vector. This is not completely -// general as it depends on assumptions about the formatting of the file. -// -// To compare to and probably replace with: -// https://martian-lang.github.io/martian-rust/doc/martian_filetypes/json_file/ -// index.html#lazy-readwrite-example - -pub fn read_vector_entry_from_json(json: &mut R) -> Result>, String> { - let mut line = String::new(); - if json.read_line(&mut line).is_err() || line == *"" || line == *"[]" { - return Ok(None); - } - if line == *"[\n" { - line.clear(); - if json.read_line(&mut line).is_err() { - return Err( - "\nProblem reading json file, probably due to a defect in it.\n".to_string(), - ); - } - } - let mut entry = Vec::::new(); - let (mut curlies, mut bracks, mut quotes) = (0_isize, 0_isize, 0_isize); - let mut s = line.as_bytes(); - loop { - if (s == b"]" || s == b"]\n") && curlies == 0 && bracks == 0 && quotes % 2 == 0 { - if !entry.is_empty() { - return Ok(Some(entry)); - } else { - return Ok(None); - } - } - let mut cpos = -1_isize; - if s.is_empty() { - return Err("\nError reading json file. It is possible that the file \ - was truncated.\n" - .to_string()); - } - for i in (0..s.len() - 1).rev() { - if s[i] == b',' { - cpos = i as isize; - break; - } - if s[i] != b' ' { - break; - } - } - let mut escaped = false; - for i in 0..s.len() { - if !escaped && s[i] == b'"' { - quotes += 1; - } else if !escaped && quotes % 2 == 0 { - match s[i] { - b'{' => curlies += 1, - b'}' => curlies -= 1, - b'[' => bracks += 1, - b']' => bracks -= 1, - b',' => { - if i as isize == cpos && curlies == 0 && bracks == 0 && quotes % 2 == 0 { - return Ok(Some(entry)); - } - } - _ => {} - }; - } - escaped = s[i] == b'\\' && !escaped; - entry.push(s[i]); - } - line.clear(); - if json.read_line(&mut line).is_err() { - return Err("\nSomething appears to be defective in a json file.\n".to_string()); - } - s = line.as_bytes(); - } -} - // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ // READ FILE TO STRING AND PRINT FILE NAME IF IT DOESN'T EXIST // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ From f81ed7c965bd2e168807c048e7b6ab227fdce001 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Mon, 4 Mar 2024 15:48:30 -0800 Subject: [PATCH 07/15] Replace lazy JSON reading with martian filetypes. --- Cargo.lock | 170 +++++++++++++++++++++++++++++++--- enclone_args/Cargo.toml | 1 + enclone_args/src/read_json.rs | 35 +++---- 3 files changed, 170 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0aaf9b057..94554fe38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" dependencies = [ "gimli", ] @@ -64,6 +64,9 @@ name = "anyhow" version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26fa4d7e3f2eebadf743988fc8aec9fa9a9e82611acafd77c1462ed6262440a" +dependencies = [ + "backtrace", +] [[package]] name = "approx" @@ -117,9 +120,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "backtrace" -version = "0.3.68" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" dependencies = [ "addr2line", "cc", @@ -354,6 +357,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "debruijn" version = "0.3.4" @@ -380,6 +404,12 @@ dependencies = [ "uuid", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" + [[package]] name = "derive-new" version = "0.5.9" @@ -462,6 +492,7 @@ dependencies = [ "hdf5", "io_utils", "itertools", + "martian-filetypes", "rand", "rayon", "regex", @@ -673,6 +704,15 @@ dependencies = [ "instant", ] +[[package]] +name = "fern" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9f0c14694cbd524c8720dd69b0e3179344f04ebb5f90f2e4a440c6ea3b2f1ee" +dependencies = [ + "log", +] + [[package]] name = "filetime" version = "0.2.19" @@ -752,9 +792,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.27.3" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "graph_simple" @@ -1031,6 +1071,52 @@ dependencies = [ "libc", ] +[[package]] +name = "martian" +version = "0.26.0" +source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21" +dependencies = [ + "anyhow", + "backtrace", + "fern", + "heck", + "log", + "rustc_version", + "serde", + "serde_json", + "tempfile", + "time", +] + +[[package]] +name = "martian-derive" +version = "0.26.0" +source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21" +dependencies = [ + "martian", + "proc-macro2", + "quote", + "serde", + "syn 2.0.52", +] + +[[package]] +name = "martian-filetypes" +version = "0.27.0" +source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21" +dependencies = [ + "anyhow", + "bincode", + "csv", + "flate2", + "lz4", + "martian", + "martian-derive", + "serde", + "serde_json", + "zstd", +] + [[package]] name = "matches" version = "0.1.9" @@ -1202,11 +1288,20 @@ dependencies = [ "libm", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "object" -version = "0.31.1" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] @@ -1385,7 +1480,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", - "syn 2.0.50", + "syn 2.0.52", ] [[package]] @@ -1479,7 +1574,7 @@ dependencies = [ "prost 0.12.3", "prost-types 0.12.3", "regex", - "syn 2.0.50", + "syn 2.0.52", "tempfile", "which", ] @@ -1507,7 +1602,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.50", + "syn 2.0.52", ] [[package]] @@ -1663,9 +1758,18 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.21" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] [[package]] name = "rustix" @@ -1740,6 +1844,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "semver" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" + [[package]] name = "serde" version = "1.0.156" @@ -1904,9 +2014,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.50" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74f1bdc9872430ce9b75da68329d1c1746faf50ffac5f19e02b71e37ff881ffb" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -1981,6 +2091,36 @@ dependencies = [ "libc", ] +[[package]] +name = "time" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a79d09ac6b08c1ab3906a2f7cc2e81a0e27c7ae89c63812df75e52bef0751e07" +dependencies = [ + "deranged", + "itoa", + "libc", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75c65469ed6b3a4809d987a41eb1dc918e9bc1d92211cbad7ae82931846f7451" +dependencies = [ + "time-core", +] + [[package]] name = "tinyvec" version = "1.6.0" diff --git a/enclone_args/Cargo.toml b/enclone_args/Cargo.toml index 92a392fab..d634a1ffc 100644 --- a/enclone_args/Cargo.toml +++ b/enclone_args/Cargo.toml @@ -30,6 +30,7 @@ evalexpr = ">=7, <12" expr_tools = { path = "../expr_tools" } io_utils = { path = "../io_utils" } itertools.workspace = true +martian-filetypes = { git = "https://github.com/martian-lang/martian-rust", branch = "master" } rand = "0.8" rayon = "1" regex = { version = "1", default-features = false, features = ["std", "perf"] } diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index 745ce2f27..8daa80e9d 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -7,10 +7,13 @@ use debruijn::dna_string::DnaString; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{EncloneControl, OriginInfo, TigData}; use io_utils::{open_maybe_compressed, path_exists}; +use martian_filetypes::json_file::{Json, LazyJsonReader}; +use martian_filetypes::LazyRead; use rand::Rng; use rayon::prelude::*; use std::collections::HashMap; use std::fmt::Write; +use std::io::BufReader; use string_utils::{stringme, strme, TextUtils}; use vdj_ann::annotate::ContigAnnotation; use vdj_ann::{annotate, refx, transcript}; @@ -69,7 +72,7 @@ struct JsonParseResult { tig: Option, } -fn parse_vector_entry_from_json( +fn process_json_annotation( ann: ContigAnnotation, json: &str, accept_inconsistent: bool, @@ -601,21 +604,15 @@ fn read_json( input files to enclone, including the PRE argument.\n" )); } - // Read the entire file to memory before parsing. - // See https://github.com/serde-rs/json/issues/160 - // The previous implementation was essentially doing this anyway, so it - // shouldn't drastically change our memory consumption. - let mut contents = String::new(); - open_maybe_compressed(&jsonx) - .read_to_string(&mut contents) - .unwrap(); - - let mut results: Vec<_> = serde_json::Deserializer::from_str(&contents) - .into_iter::() - .enumerate() - .par_bridge() - .map(|(ann_index, entry)| { - parse_vector_entry_from_json( + + let reader: LazyJsonReader<_, Json, _> = + LazyJsonReader::with_reader(BufReader::new(open_maybe_compressed(&jsonx))) + .map_err(|err| format!("{err:#?}"))?; + + let results: Vec<_> = reader + .into_iter() + .map(|entry| { + process_json_annotation( entry.unwrap(), json, accept_inconsistent, @@ -626,18 +623,14 @@ fn read_json( reannotate, ctl, ) - .map(|r| (ann_index, r)) }) .collect::, String>>()?; - // rayon's par_bridge feature doesn't preserve order, so sort the result - // for stable behavior. - results.sort_by_key(|(ann_index, _)| *ann_index); let mut tigs = Vec::new(); let mut vdj_cells = Vec::new(); let mut gex_cells = Vec::new(); let mut gex_cells_specified = false; - for (_, result) in results { + for result in results { if let Some(tig) = result.tig { tigs.push(tig); } From 9ab8d960bdf3a15225bd2055d222799f1d8448af Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Mon, 4 Mar 2024 16:00:06 -0800 Subject: [PATCH 08/15] Add syn to deny.toml. --- deny.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deny.toml b/deny.toml index 98dbe080c..a3e328104 100644 --- a/deny.toml +++ b/deny.toml @@ -139,3 +139,9 @@ github = ["10XGenomics"] gitlab = [] # 1 or more bitbucket.org organizations to allow git sources for bitbucket = [] + + +[[bans.skip]] +# many packages depend on syn 1 +name = "syn" +version = "1.0.105" From 245ca15d5d7093026ca46321116e070a81382752 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Mon, 4 Mar 2024 16:05:28 -0800 Subject: [PATCH 09/15] Allow martian-lang git repos. --- deny.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/deny.toml b/deny.toml index a3e328104..89e15694c 100644 --- a/deny.toml +++ b/deny.toml @@ -130,6 +130,7 @@ allow-registry = ["https://github.com/rust-lang/crates.io-index"] allow-git = [ # TODO: remove this "https://github.com/Barandis/qd", + "https://github.com/martian-lang/martian-rust", ] [sources.allow-org] From ad13f638fe5900e5e401842228e066d2527ef353 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Mon, 4 Mar 2024 16:38:49 -0800 Subject: [PATCH 10/15] Remove the not-needed manual de-escaping code. --- enclone_args/src/read_json.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index 8daa80e9d..702177869 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -448,21 +448,10 @@ fn process_json_annotation( return Err(json_error(Some(json), ctl.gen_opt.internal_run, &msg)); } let (tig_start, tig_stop) = (tig_start as usize, tig_stop as usize); - let mut quals = Vec::::new(); - let mut slashed = false; - for &qual in ann.quals.as_bytes() { - if !slashed && qual == b'\\' - /* && ( i == 0 || quals0[i-1] != b'\\' ) */ - { - slashed = true; - continue; - } - slashed = false; - quals.push(qual); - } - assert_eq!(ann.sequence.len(), quals.len()); + let mut quals = ann.quals.as_bytes().to_vec(); + assert_eq!(ann.sequence.len(), ann.quals.as_bytes().len()); let seq = &ann.sequence[tig_start..tig_stop].to_string(); - for qual in quals.iter_mut() { + for qual in &mut quals { *qual -= 33_u8; } let full_quals = quals; From bdd34395d66fdd6181429df5a14e201ad540bf37 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Tue, 5 Mar 2024 15:59:44 -0800 Subject: [PATCH 11/15] Re-add missing chain_type setter. --- enclone_args/src/read_json.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index 702177869..1fc6f03b2 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -310,7 +310,7 @@ fn process_json_annotation( } if region_type == VdjRegion::V && ref_start == 0 { let chain = a.feature.chain; - + chain_type = chain.to_string(); tig_start = a.contig_match_start as isize; cdr3_start -= tig_start as usize; if chain == VdjChain::IGH From 4a35a0c0208cbc54121951caf44f88d0ef258d4b Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Tue, 5 Mar 2024 16:12:20 -0800 Subject: [PATCH 12/15] Load annotations as empty vec if missing. --- vdj_ann/src/annotate.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index 38bd005f5..0fecda3ae 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -3007,9 +3007,10 @@ pub struct ContigAnnotation { pub fwr4: Option, // annotations + #[serde(default)] pub annotations: Vec, // the annotations - pub clonotype: Option, // null, filled in later - pub info: ClonotypeInfo, // Empty initially, may be filled in later + pub clonotype: Option, // null, filled in later + pub info: ClonotypeInfo, // Empty initially, may be filled in later // state of the contig pub high_confidence: bool, // declared high confidence? From a5cdba03f058aebedf6e9796befdd4bf14f0b248 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Tue, 5 Mar 2024 16:22:24 -0800 Subject: [PATCH 13/15] Load info as default if missing. --- vdj_ann/src/annotate.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index 0fecda3ae..527f32914 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -3010,7 +3010,8 @@ pub struct ContigAnnotation { #[serde(default)] pub annotations: Vec, // the annotations pub clonotype: Option, // null, filled in later - pub info: ClonotypeInfo, // Empty initially, may be filled in later + #[serde(default)] + pub info: ClonotypeInfo, // Empty initially, may be filled in later // state of the contig pub high_confidence: bool, // declared high confidence? From dcf69f9727a3776a6936ff16bed8f370ddc51d9e Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Tue, 5 Mar 2024 16:26:52 -0800 Subject: [PATCH 14/15] Load filtered as default if missing. --- vdj_ann/src/annotate.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index 527f32914..adc59d263 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -3020,7 +3020,8 @@ pub struct ContigAnnotation { pub invalidated_umis: Option>, // invalidated UMIs pub is_cell: bool, // was the barcode declared a cell? pub productive: Option, // productive? (null means not full length) - pub filtered: bool, // true and never changed (unused field) + #[serde(default = "set_true")] + pub filtered: bool, // true and never changed (unused field) pub is_gex_cell: Option, // Was the barcode declared a cell by Gene expression data, if available pub is_asm_cell: Option, // Was the barcode declared a cell by the VDJ assembler @@ -3032,6 +3033,10 @@ pub struct ContigAnnotation { pub sample: Option, } +fn set_true() -> bool { + true +} + impl ContigAnnotation { // Given the alignment entities produced by annotate_seq, produce a // ContigAnnotation. This is done so as to produce at most one V, D, J and C, From 644438be90174e76c9bda6d68f36f47bd9109511 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 6 Mar 2024 12:15:25 -0800 Subject: [PATCH 15/15] Refactor JSON loading to avoid collecting into an intermediate vector. --- enclone_args/src/read_json.rs | 45 ++++++++++++++++------------------- enclone_stuff/src/start.rs | 2 +- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index 1fc6f03b2..c08ebec52 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -27,7 +27,7 @@ fn json_error(json: Option<&str>, internal_run: bool, msg: &str) -> String { "There is something wrong with the contig annotations in the cellranger output file" .to_string(); if let Some(json) = json { - write!(msgx, "\n{}.", json).unwrap(); + write!(msgx, "\n{json}.").unwrap(); } else { msgx += "."; } @@ -594,32 +594,27 @@ fn read_json( )); } - let reader: LazyJsonReader<_, Json, _> = - LazyJsonReader::with_reader(BufReader::new(open_maybe_compressed(&jsonx))) - .map_err(|err| format!("{err:#?}"))?; - - let results: Vec<_> = reader - .into_iter() - .map(|entry| { - process_json_annotation( - entry.unwrap(), - json, - accept_inconsistent, - origin_info, - dataset_index, - refdata, - to_ref_index, - reannotate, - ctl, - ) - }) - .collect::, String>>()?; - let mut tigs = Vec::new(); let mut vdj_cells = Vec::new(); let mut gex_cells = Vec::new(); let mut gex_cells_specified = false; - for result in results { + + let reader: LazyJsonReader = + LazyJsonReader::with_reader(BufReader::new(open_maybe_compressed(&jsonx))) + .map_err(|err| format!("{err:#?}"))?; + + for entry in reader.into_iter() { + let result = process_json_annotation( + entry.map_err(|err| err.to_string())?, + json, + accept_inconsistent, + origin_info, + dataset_index, + refdata, + to_ref_index, + reannotate, + ctl, + )?; if let Some(tig) = result.tig { tigs.push(tig); } @@ -720,8 +715,8 @@ pub fn parse_json_annotations_files( .par_iter() .enumerate() .map(|(li, dataset_path)| { - let json = format!("{}/{ann}", dataset_path); - let json_lz4 = format!("{}/{ann}.lz4", dataset_path); + let json = format!("{dataset_path}/{ann}"); + let json_lz4 = format!("{dataset_path}/{ann}.lz4"); if !path_exists(&json) && !path_exists(&json_lz4) { return Err(format!("\ncan't find {json} or {json_lz4}\n")); } diff --git a/enclone_stuff/src/start.rs b/enclone_stuff/src/start.rs index 62c4784c0..52ae52f93 100644 --- a/enclone_stuff/src/start.rs +++ b/enclone_stuff/src/start.rs @@ -20,7 +20,7 @@ use enclone::misc2::{check_for_barcode_reuse, find_exact_subclonotypes, search_f use enclone::misc3::sort_tig_bc; use enclone_args::read_json::{parse_json_annotations_files, Annotations}; use enclone_core::barcode_fate::BarcodeFate; -use enclone_core::defs::{AlleleData, CloneInfo, TigData}; +use enclone_core::defs::{AlleleData, CloneInfo}; use enclone_core::enclone_structs::{EncloneExacts, EncloneIntermediates, EncloneSetup}; use enclone_core::hcomp::heavy_complexity; use enclone_print::define_mat::{define_mat, setup_define_mat};