From e048eef7483c21deaae34acaf2e6ac21074c79ba Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 28 Feb 2024 15:58:16 -0800 Subject: [PATCH 1/7] Remove bin matrix loading, refactor massive tuple into a struct. --- enclone/src/UNDOC_OPTIONS | 83 +++-- enclone_args/src/load_gex_core.rs | 410 +++++++---------------- enclone_args/src/process_special_arg1.rs | 4 - enclone_core/src/defs.rs | 1 - enclone_core/src/slurp.rs | 19 -- mirror_sparse_matrix/src/lib.rs | 33 -- 6 files changed, 170 insertions(+), 380 deletions(-) diff --git a/enclone/src/UNDOC_OPTIONS b/enclone/src/UNDOC_OPTIONS index b2e2193f1..6b480bce2 100644 --- a/enclone/src/UNDOC_OPTIONS +++ b/enclone/src/UNDOC_OPTIONS @@ -1,11 +1,12 @@ -This file contains enclone options that are not in the online documentation for enclone. They are -here for a reason, for example, maybe they are buggy, or untested, or of dubious utility, or +This file contains enclone options that are not in the online documentation for enclone. They are +here for a reason, for example, maybe they are buggy, or untested, or of dubious utility, or intended for debugging purposes only, or only work from a 10x server, etc. Use at your own risk, ha ha. Optional arguments controlling printing of join events: + - SEQ: print sequences of contigs, before truncation to V..J - ANN: print annotations of contig - ANN0: print annotations of contig, after truncation to V..J @@ -15,21 +16,25 @@ Optional arguments controlling printing of join events: - FAILS_ONLY: only print failed clonotypes. Optional arguments that control clonotype filtering: + - WHITEF: only show clonotypes that exhibit whitelist contamination - PROTECT_BADS: don't delete putatively bad stuff - VDUP: only show clonotypes having two chains with the same V segment - HAVE_ONESIE: only show clonotypes including a onesie exact subclonotype Optional arguments that control miscellaneous logging: + - DESCRIP: print sample descriptions, and also reference info - REUSE: print barcode reuse (that might arise from index hopping) - EXACT=n: print some data about exact subclonotype n. Optional arguments controlling logging for alternate alleles: + - CON: print alternate consensus sequences - CON_TRACE: tracing for CON. Optional arguments that control the joining algorithm: + - MIN_MULT: to document - MIN_ALT: to document - BCJOIN: substitute an algorithm that joins by barcode identity and ignores everything @@ -37,16 +42,19 @@ Optional arguments that control the joining algorithm: different enrichment protocols. Optional arguments governing input and output files: + - EXFASTA=f: dump fasta file f for V..J exact subclonotypes. Optional arguments that control printing of individual clonotypes: + - white = percent of sequences implicated in whitelist expansion. - CHAIN_BRIEF: show abbreviated chain column headers - DEBUG_TABLE_PRINTING: add print lines to help debug printing of tables. -- NOTE_SIMPLE: note if the first sequence for the chain is simple, in the sense that it exactly +- NOTE_SIMPLE: note if the first sequence for the chain is simple, in the sense that it exactly equals the concatenation of the right-truncated V with the full J segment. Other optional arguments: + - FORCE: make joins even if redundant - EXP: exploratory code for exact clonotyping on - WEAK: for EXP, print all and show weaks @@ -67,22 +75,22 @@ Other optional arguments: - DUMP_INTERNAL_IDS: special option to dump the list of internal ids and exit. - TOY: toy with phylogeny -If an argument is a nonnegative integer, it is treated as if specified by TCR= or BCR=, but -without setting one of them. Nonnegative integers are recognized as 10x dataset identifiers -("sample IDs") and their location is automatically found. Inclusive ranges l1-l2 are +If an argument is a nonnegative integer, it is treated as if specified by TCR= or BCR=, but +without setting one of them. Nonnegative integers are recognized as 10x dataset identifiers +("sample IDs") and their location is automatically found. Inclusive ranges l1-l2 are expanded out. CELLRANGER: for use if called from cellranger -- changes failure message and prevents exit upon normal completion EXT=filename: -Given output of an external clonotyping algorithm which took as inputs the pipeline outputs -for the lenas in enclone.testdata, for each exact subclonotype found by enclone, report its +Given output of an external clonotyping algorithm which took as inputs the pipeline outputs +for the lenas in enclone.testdata, for each exact subclonotype found by enclone, report its composition in the external clonotyping, as clonotype_id[count], ... The input file should have lines of the form: sample barcode clonotype_id. -SUMMARY_CLEAN: if SUMMARY specified, don't show computational performance stats, so +SUMMARY_CLEAN: if SUMMARY specified, don't show computational performance stats, so we can regress on output ACCEPT_INCONSISTENT: temporary option to accept inconsistent reference/annotations and feature refs @@ -90,24 +98,25 @@ ACCEPT_INCONSISTENT: temporary option to accept inconsistent reference/annotatio CURRENT_REF: temp option to force CR 4.0 reference H5_SLICE: read GEX data slice by slice rather than preloading -H5: force use of H5, even if feature_barcode_matrix.bin has been built FORCE_EXTERNAL: do not test for internal run STABLE_DOC: print documentation in a way that it won't change when the git version changes LVARS entries -* entropy = for each cell, take the sum over all genes of -q/log2(q), where q is the fraction of + +- entropy = for each cell, take the sum over all genes of -q/log2(q), where q is the fraction of total gene UMIs in the given gene; for an exact subclonotype report the median of these values -* clust = cluster id, from analysis_csv/clustering/graphclust/clusters.csv +- clust = cluster id, from analysis_csv/clustering/graphclust/clusters.csv (for now, has to be in that exact place; not implemented for PCELL) -* type = cell type, from analysis_csv/celltypes/celltypes.csv +- type = cell type, from analysis_csv/celltypes/celltypes.csv Note that it's not clear how doublets are handled. -* pe = PCA equivalence class at distance ≤ n, where n is an integer: it shows the equivalence +- pe = PCA equivalence class at distance ≤ n, where n is an integer: it shows the equivalence class on all cells in a single clonotype when two cells are called equivalent if their distance in PCA space is ≤ n. This is only implemented for PER_CELL. This uses the file analysis_csv/pca/10_components/projection.csv. WARNING: this only makes sense if you have one dataset--this condition is not checked. + - npe = total number of cells in this clonotype that are within PCA distance n of this cell - ppe = percent of all gex cells within PCA distance n of this cell that are in this clonotype @@ -118,6 +127,7 @@ IMGT: specify the IMGT reference, forces RE so slow IMGT_FIX: make certain hardcoded fixes to the IMGT reference. SVG: render output as svg, clonotype by clonotype. There are problems with this: + 1. Safari does not correctly render these svg files. 2. The svg files use the Menlo font, which works on a Mac but is not universally available. 3. We don't render background, so this is broken for PER_CELL. @@ -127,6 +137,7 @@ TRACE_BARCODE: print some additional information for this particular barcode SUMMARY_CSV: nascent option to emit some summary information as CSV output A complex of related experimental stuff: + - BASELINE: show stats relevant to NUMI and the the following two options - UMI_FILT_MARK: mark cells based on these stats; view with undocumented lvar "mark" and only viewable with PER_CELL @@ -142,8 +153,9 @@ ECHOC: echo command, preceeded by "# ", and don't emit blank line --------------------------------------------------------------------------------------------------- MARK_STATS: generate three statistics (assumes GEX data included): -1. number of dubious cells, where dubious means not classified as a B cell, or in a clonotype for - which the ratio of the top dataset cells to the runner up dataset cells is at least 10 + +1. number of dubious cells, where dubious means not classified as a B cell, or in a clonotype for + which the ratio of the top dataset cells to the runner up dataset cells is at least 10 (e.g. 3:0 works as does 10:1) 2. number of cells that are marked (in selected clonotypes) 3. of these, number that are "good", defined as @@ -152,16 +164,17 @@ MARK_STATS: generate three statistics (assumes GEX data included): - and which have 2 or 3 chains. MARK_STATS2: generate four statistics (assumes GEX data included): + 1. number of fake expanded clonotype cells, defined as all but one cell in clonotypes for which the ratio of the top dataset cells to the runner up dataset cells is at least 10 2. number of these cells that are marked by the filter (ADVANTAGE) -3. number of good expanded clonotype cells, defined as cells in clonotypes having at least 10 +3. number of good expanded clonotype cells, defined as cells in clonotypes having at least 10 cells, called a B cell by GEX, having 2 or 3 chains, and not fake as above 2. number of these cells that are marked by the filter (DISADVANTAGE) [The motivation for this, as distinct from MARK_STATS, is that (1) now consists of just cells in fake expanded clonotypes, and not just all dubious cells, would could be a substantially larger set. This allows the analysis of the effect of a filter on fake expanded clonotypes.] -*** THIS SHOULD BE RUN WITH NGEX AND NCROSS. *** +***THIS SHOULD BE RUN WITH NGEX AND NCROSS.*** --------------------------------------------------------------------------------------------------- @@ -185,7 +198,7 @@ NALL_GEX: turn off all filters except GEX filter REPROD: accept nonproductive contigs, recompute productive, then reject nonproductive -SPLIT_COMMAND: assuming that BCR and GEX have been provided with multiple entries each, split out +SPLIT_COMMAND: assuming that BCR and GEX have been provided with multiple entries each, split out separate commands having them specified one by one, and run them, then exit OPTIONS THAT ALLOW METRIC VALUES TO BE LOCKED @@ -228,26 +241,28 @@ All three classes of UMIs are capped at 20 UMIs. =================================================================================================== CLONOTYPE_GROUP_NAMES=filename -* The file should be a a CSV file with that includes fields new_group_name and group_id. This is + +- The file should be a a CSV file with that includes fields new_group_name and group_id. This is confusing nomenclature, see what follows. -* The group_id should be obtained by running enclone WITHOUT grouping, so that there is exactly +- The group_id should be obtained by running enclone WITHOUT grouping, so that there is exactly one clonotype per group. -* The new_group_name is the name of a group (of your own concoction) that you are assigning a +- The new_group_name is the name of a group (of your own concoction) that you are assigning a clonotype to. -* A clonotype may not be assigned to more than one group. -* The idea is that one would run enclone once to gather information that would +- A clonotype may not be assigned to more than one group. +- The idea is that one would run enclone once to gather information that would allow grouping of clonotypes. Then run it as second time with this argument. -* The argument only does something if honeycomb plotting is used. In that case, the honeycomb +- The argument only does something if honeycomb plotting is used. In that case, the honeycomb plot is split by group and each group is given a different (light) background color. -* Clonotypes that are not assigned to a group are not displayed. +- Clonotypes that are not assigned to a group are not displayed. =================================================================================================== read utilization - This is a stat generated with the SUMMARY option, in internal mode. It is the fraction of reads that are assigned to chains by enclone. It has the following issues: + 1. It doesn't account for reads that are lost because of capping in assembly. Internally, we can work around this by removing the capping on a cellranger branch. (Done now.) -2. It would not be correct if read one was significantly longer than 28 bases, so that it could +2. It would not be correct if read one was significantly longer than 28 bases, so that it could contribute to assembly. NO_UNCAP_SIM: turn off uncapping simulation @@ -263,7 +278,7 @@ This won't work with paging, however you can pipe to "less -r". ROW_FILL_VERBOSE: special option for debugging -TOP_GENES: list genes having the highest expression in the selected clonotypes, taking the median +TOP_GENES: list genes having the highest expression in the selected clonotypes, taking the median across all cells (only implemented if .bin file has been generated using NH5) COMPE: COMP, plus enforce no unaccounted time (except up to 0.02 seconds) @@ -274,10 +289,11 @@ TOY_COM: compute clonotypes, then act as toy server, which can speak to enclone_ TOOLTIP: add tooltip text to honeycomb plots -ALIGN_JUN_ALIGN_CONSISTENCY: test to see if they give consistent results +ALIGN_JUN_ALIGN_CONSISTENCY: test to see if they give consistent results (run with ALIGN1 JUN_ALIGN1 PLAIN) junction region alignment penalties accessible from the command line: + - JSCORE_MATCH - JSCORE_MISMATCH - JSCORE_GAP_EXTEND @@ -286,6 +302,7 @@ junction region alignment penalties accessible from the command line: lvars fb and fb_n: for example, fb1 is the 15-base sequence of the most frequent feature barcode, and fb1_n is the number of UMIs for that feature barcode + - only usable if there is just one dataset - the file feature_barcode_matrix_top.bin must have been generated @@ -310,6 +327,7 @@ or not they correspond to VDJ cells. This would correspond (at least approximat whitelisted barcodes in the raw data. Only certain other fields are allowed: + 1. feature variables, e.g. CDR3_ab, representing the UMI count 2. gex = total gene expression UMI count 3. type = the cell type @@ -325,8 +343,8 @@ ALL_BCH: same but human readable instead of CSV PRE_EVAL: evaluate sensitivity and specificity before calling print_clonotypes JOIN_BASIC_H=n: use a special join heuristic as follows (for demonstration purposes only) - same heavy chain V, same heavy chain J, - same heavy chain CDR3 length, + same heavy chain V, same heavy chain J, + same heavy chain CDR3 length, n% identity on nucleotides within heavy chain CDR3 This automatically invokes PRE_EVAL because otherwise it would crash. This option is particularly slow because it forces more comparisons in the join step. @@ -338,8 +356,9 @@ EXTERNAL_REF: if you set this to a IMGT reference fasta file, this will compare generated donor reference to it, do some analyses, and exit To generate the current IMGT reference within the 10x codebase, do something like this: + 1. bazel build //:pd //:shimulate //:devpipes_env -2. cellranger/bazel-bin/devpipes_env.sh python cellranger/lib/bin/fetch_imgt_lib.py +2. cellranger/bazel-bin/devpipes_env.sh python cellranger/lib/bin/fetch_imgt_lib.py --genome vdj_IMGT_human --species "Homo sapiens" =================================================================================================== diff --git a/enclone_args/src/load_gex_core.rs b/enclone_args/src/load_gex_core.rs index 05a3c869e..9abd473a2 100644 --- a/enclone_args/src/load_gex_core.rs +++ b/enclone_args/src/load_gex_core.rs @@ -11,22 +11,50 @@ use enclone_core::defs::EncloneControl; use enclone_core::slurp::slurp_h5; use io_utils::{dir_list, open_for_read, open_userfile_for_read, path_exists}; use itertools::Itertools; -use mirror_sparse_matrix::{ - get_code_version_from_file, read_from_file, write_to_file, MirrorSparseMatrix, -}; +use mirror_sparse_matrix::MirrorSparseMatrix; use rayon::prelude::*; use serde_json::Value; use std::{ collections::HashMap, convert::TryInto, fmt::Write, - fs::{read_to_string, remove_file, File}, + fs::read_to_string, io::{BufRead, Read}, time::Instant, }; use string_utils::{parse_csv, TextUtils}; use vector_utils::{unique_sort, VecUtils}; +#[derive(Default)] +struct LoadResult { + f1: Vec, + f2: Vec, + f3: MirrorSparseMatrix, + f4: Option, + f5: Option, + f6: Vec, + f7: HashMap, + f8: HashMap, + f9: HashMap>, + f10: bool, + f11: String, + f12: String, + f13: MirrorSparseMatrix, + f14: Vec, + f15: Vec, + f16: HashMap<(String, String), String>, + f17: HashMap, + f18: String, + f19: u64, + f20: Vec<(String, u32, u32)>, + f21: String, + f23: Vec<(String, u32, u32)>, + f24: MirrorSparseMatrix, + f25: Vec, + f26: u64, + f27: Vec<(String, u32, u32, u32)>, +} + pub fn load_gex( ctl: &mut EncloneControl, gex_features: &mut Vec>, @@ -57,67 +85,9 @@ pub fn load_gex( metrics: &mut Vec, ) -> Result<(), String> { let t = Instant::now(); - let mut results = Vec::<( - usize, - Vec, - Vec, - MirrorSparseMatrix, - Option, - Option, - Vec, - HashMap, - HashMap, - HashMap>, - bool, - String, - String, - MirrorSparseMatrix, - Vec, - Vec, - HashMap<(String, String), String>, - HashMap, - String, - u64, - Vec<(String, u32, u32)>, - String, - (Vec, Vec>), - Vec<(String, u32, u32)>, - MirrorSparseMatrix, - Vec, - u64, - Vec<(String, u32, u32, u32)>, - )>::new(); + let mut results = Vec::<(usize, LoadResult)>::new(); for i in 0..ctl.origin_info.gex_path.len() { - results.push(( - i, - Vec::::new(), - Vec::::new(), - MirrorSparseMatrix::new(), - None, - None, - Vec::::new(), - HashMap::::new(), - HashMap::::new(), - HashMap::>::new(), - false, - String::new(), - String::new(), - MirrorSparseMatrix::new(), - Vec::::new(), - Vec::::new(), - HashMap::<(String, String), String>::new(), - HashMap::::new(), - String::new(), - 0, - Vec::new(), - String::new(), - (Vec::new(), Vec::new()), - Vec::new(), - MirrorSparseMatrix::new(), - Vec::::new(), - 0, - Vec::new(), - )); + results.push((i, LoadResult::default())); } let gex_outs = &ctl.origin_info.gex_path; // Here and in other places, where an error message can be printed in a parallel loop, it @@ -128,9 +98,9 @@ pub fn load_gex( // 1. When running it over a large number of datasets, the observed load average is ~2, so // somehow the parallelism is not working. // 2. We know where the time is spent in the loop, and this is marked below. - results.par_iter_mut().for_each(|r| { - let pathlist = &mut r.15; - let i = r.0; + results.par_iter_mut().for_each(|(i, r)| { + let pathlist = &mut r.f15; + let i = *i; if !gex_outs[i].is_empty() { // First define the path where the GEX files should live, and make sure that the path // exists. @@ -142,7 +112,7 @@ pub fn load_gex( } else if root.ends_with("/outs") { outs = root.before("/outs").to_string(); if !path_exists(&outs) { - r.11 = format!( + r.f11 = format!( "\nThe directory\n{outs}\ndoes not exist. Something must be amiss with \ the arguments to PRE and/or GEX and/or META.\n" ); @@ -167,14 +137,14 @@ pub fn load_gex( } } if h5_path.is_empty() { - r.11 = format!( + r.f11 = format!( "\nThe file raw_feature_bc_matrix.h5 is not in the directory\n{outs}\n\ and neither is the older-named version raw_gene_bc_matrices_h5.h5. Perhaps \ something\nis amiss with the arguments to PRE and/or GEX and/or META.\n" ); return; } - r.12 = h5_path.clone(); + r.f12 = h5_path.clone(); let types_file = format!("{outs}/analysis_csv/celltypes/celltypes.csv"); // Define possible places for the analysis directory. @@ -208,10 +178,9 @@ pub fn load_gex( // Proceed. - let bin_file = format!("{outs}/feature_barcode_matrix.bin"); for f in [pca_file.clone(), cluster_file.clone()].iter() { if !path_exists(f) { - r.11 = format!( + r.f11 = format!( "\nThe file\n{f}\ndoes not exist. \ Perhaps one of your directories is missing some stuff.\n\n\ One possibility is that you ran \"cellranger count\" using only \ @@ -254,7 +223,7 @@ pub fn load_gex( } } if csv.is_empty() { - r.11 = format!( + r.f11 = format!( "\nSomething wrong with GEX or META argument:\ncan't find the file \ metrics_summary.csv or metrics_summary_csv.csv in the directory\n\ {outs}" @@ -262,58 +231,6 @@ pub fn load_gex( return; } - // Determine the state of affairs of the bin file. We end with one of three outcomes: - // - // 1. We're not using the bin file at all. - // 2. We are reading the bin file. - // 3. We are writing the bin file. - - let mut bin_file_state = 1; - if !ctl.gen_opt.force_h5 { - let bin_file_exists = path_exists(&bin_file); - if !bin_file_exists { - if !ctl.gen_opt.h5 { - bin_file_state = 3; - } - } else { - pathlist.push(bin_file.clone()); - // THE FOLLOWING LINE HAS BEEN OBSERVED TO FAIL SPORADICALLY. THIS HAS - // HAPPENED MULTIPLE TIMES. THE FAIL WAS IN - // binary_read_to_ref::(&mut ff, &mut x[0], 11).unwrap(); - // WHERE THE unwrap() FAILED ON - // UnexpectedEof, error: "failed to fill whole buffer". - // - // 2/15/21: this should now be fixed. - - let v = get_code_version_from_file(&bin_file); - if v == 1 { - bin_file_state = 2; - } else { - bin_file_state = 3; - } - } - } - - // If we need to write feature_barcode_matrix.bin, make sure that's possible, before - // spending a lot of time reading other stuff. - - if bin_file_state == 3 { - let f = File::create(&bin_file); - if f.is_err() { - r.11 = format!( - "\nenclone is trying to create the path\n{bin_file}\n\ - but that path cannot be created. This path is for the binary GEX \ - matrix file that enclone can read\n\ - faster than the hdf5 file. Your options are:\n\ - 1. Make that location writable (or fix the path, if it's wrong).\n\ - 2. Find a new location where you can write.\n\ - 3. Don't specify NH5 (if you specified it).\n" - ); - return; - } - remove_file(&bin_file).unwrap(); - } - // Read cell types. if path_exists(&types_file) { @@ -328,14 +245,14 @@ pub fn load_gex( let s = line.unwrap(); let barcode = s.before(","); let cell_type = s.after(","); - r.8.insert(barcode.to_string(), cell_type.to_string()); - r.10 = true; + r.f8.insert(barcode.to_string(), cell_type.to_string()); + r.f10 = true; } } else if ctl.gen_opt.mark_stats || ctl.gen_opt.mark_stats2 || ctl.clono_filt_opt_def.marked_b { - r.11 = format!( + r.f11 = format!( "\nIf you use MARK_STATS or MARK_STATS2 or MARKED_B, celltypes.csv has to \ exist, and this file\n{types_file}\ndoes not exist.\n" ); @@ -354,7 +271,7 @@ pub fn load_gex( for (var, value) in z.iter() { if value.as_f64().is_some() { let value = value.as_f64().unwrap(); - r.17.insert(var.to_string(), value); + r.f17.insert(var.to_string(), value); } } } @@ -388,7 +305,7 @@ pub fn load_gex( )); } } - r.18 = format!("{}\n", lines.iter().format("\n")); + r.f18 = format!("{}\n", lines.iter().format("\n")); } // Read feature metrics file. Note that we do not enforce the requirement of this @@ -432,7 +349,7 @@ pub fn load_gex( || xfields[j] == "num_umis_cells" || xfields[j] == "num_reads_cells" { - r.16.insert( + r.f16.insert( (feature.clone(), xfields[j].clone()), fields[j].clone(), ); @@ -457,7 +374,7 @@ pub fn load_gex( let y = s.after(",").split(',').map(str::force_f64).collect(); // This assert is turned off because in fact there are not always 10 components. // assert_eq!(x.len(), 10); - r.9.insert(barcode.to_string(), y); + r.f9.insert(barcode.to_string(), y); } // Read graph clusters, and also get the cell barcodes from that. @@ -471,8 +388,8 @@ pub fn load_gex( } let s = line.unwrap(); let (barcode, cluster) = (s.before(","), s.after(",").force_usize()); - r.7.insert(barcode.to_string(), cluster); - r.6.push(barcode.to_string()); + r.f7.insert(barcode.to_string(), cluster); + r.f6.push(barcode.to_string()); } // Get the multipliers gene and feature barcode counts. @@ -488,7 +405,7 @@ pub fn load_gex( } } if lines.is_empty() { - r.11 = format!("\nThe file\n{csv}\nis empty.\n"); + r.f11 = format!("\nThe file\n{csv}\nis empty.\n"); return; } let fields = parse_csv(&lines[0]); @@ -514,7 +431,7 @@ pub fn load_gex( || fields.len() < name_field + 1 || fields.len() < value_field + 1 { - r.11 = format!( + r.f11 = format!( "\nSomething appears to be wrong with the file\n{}:\n\ line {} doesn't have enough fields.\n", csv, @@ -529,7 +446,7 @@ pub fn load_gex( rpcx = rpcx.replace(',', ""); rpcx = rpcx.replace('\"', ""); if rpcx.parse::().is_err() { - r.11 = format!( + r.f11 = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the Gene Expression Mean Reads per Cell value isn't an integer.\n" ); @@ -545,7 +462,7 @@ pub fn load_gex( fbrpcx = fbrpcx.replace(',', ""); fbrpcx = fbrpcx.replace('\"', ""); if fbrpcx.parse::().is_err() { - r.11 = format!( + r.f11 = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the Antibody/Antigen Capture Mean Reads per Cell value isn't an integer.\n" ); @@ -555,7 +472,7 @@ pub fn load_gex( } } if rpc.is_none() && fbrpc.is_none() { - r.11 = format!( + r.f11 = format!( "\nGene expression or feature barcode data was expected, however the \ CSV file\n{csv}\n\ does not have values for Gene Expression Mean Reads per Cell or @@ -579,7 +496,7 @@ pub fn load_gex( } } else if line_no == 1 { if rpc_field.is_some() && rpc_field.unwrap() >= fields.len() { - r.11 = format!( + r.f11 = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the second line doesn't have enough fields.\n" ); @@ -589,7 +506,7 @@ pub fn load_gex( rpcx = rpcx.replace(',', ""); rpcx = rpcx.replace('\"', ""); if rpcx.parse::().is_err() { - r.11 = format!( + r.f11 = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the Mean Reads per Cell field isn't an integer.\n" ); @@ -598,7 +515,7 @@ pub fn load_gex( rpc = Some(rpcx.force_usize() as isize); } if fbrpc_field.is_some() && fbrpc_field.unwrap() >= fields.len() { - r.11 = format!( + r.f11 = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the second line doesn't have enough fields.\n" ); @@ -608,7 +525,7 @@ pub fn load_gex( fbrpcx = fbrpcx.replace(',', ""); fbrpcx = fbrpcx.replace('\"', ""); if fbrpcx.parse::().is_err() { - r.11 = format!( + r.f11 = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the Antibody/Antigen: Mean Reads per Cell field isn't an integer.\n" ); @@ -619,7 +536,7 @@ pub fn load_gex( } } if rpc.is_none() && fbrpc.is_none() { - r.11 = format!( + r.f11 = format!( "\nGene expression or feature barcode data was expected, however the \ CSV file\n{csv}\n\ does not have a field \"Mean Reads per Cell\" or \ @@ -639,30 +556,8 @@ pub fn load_gex( const FB_RPC_EXPECTED: f64 = 5_000.0; fb_mult = Some(FB_RPC_EXPECTED / fbrpc as f64); } - r.4 = gene_mult; - r.5 = fb_mult; - - // Read the top feature barcode matrix, by UMIs. - - let top_file = fnx(&outs, "feature_barcode_matrix_top.bin"); - if path_exists(&top_file) { - pathlist.push(top_file.clone()); - read_from_file(&mut r.13, &top_file); - for i in 0..r.13.nrows() { - r.14.push(r.13.row_label(i)); - } - } - - // Read the top feature barcode matrix, by reads. - - let top_file = fnx(&outs, "feature_barcode_matrix_top_reads.bin"); - if path_exists(&top_file) { - pathlist.push(top_file.clone()); - read_from_file(&mut r.24, &top_file); - for i in 0..r.24.nrows() { - r.25.push(r.24.row_label(i)); - } - } + r.f4 = gene_mult; + r.f5 = fb_mult; // Read the total UMIs. @@ -672,7 +567,7 @@ pub fn load_gex( let mut f = open_for_read![&top_file]; let mut bytes = Vec::::new(); f.read_to_end(&mut bytes).unwrap(); - r.19 = u64::from_ne_bytes(bytes.try_into().unwrap()); + r.f19 = u64::from_ne_bytes(bytes.try_into().unwrap()); } // Read the total reads. @@ -683,7 +578,7 @@ pub fn load_gex( let mut f = open_for_read![&top_file]; let mut bytes = Vec::::new(); f.read_to_end(&mut bytes).unwrap(); - r.26 = u64::from_ne_bytes(bytes.try_into().unwrap()); + r.f26 = u64::from_ne_bytes(bytes.try_into().unwrap()); } // Read the barcode-ref-nonref UMI count file. @@ -695,7 +590,7 @@ pub fn load_gex( for line in f.lines() { let s = line.unwrap(); let fields = parse_csv(&s); - r.20.push(( + r.f20.push(( fields[0].to_string(), fields[1].parse::().unwrap(), fields[2].parse::().unwrap(), @@ -712,7 +607,7 @@ pub fn load_gex( for line in f.lines() { let s = line.unwrap(); let fields = parse_csv(&s); - r.23.push(( + r.f23.push(( fields[0].to_string(), fields[1].parse::().unwrap(), fields[2].parse::().unwrap(), @@ -729,7 +624,7 @@ pub fn load_gex( for line in f.lines() { let s = line.unwrap(); let fields = parse_csv(&s); - r.27.push(( + r.f27.push(( fields[0].to_string(), fields[1].parse::().unwrap(), fields[2].parse::().unwrap(), @@ -743,77 +638,46 @@ pub fn load_gex( let fref_file = fnx(&outs, "feature_reference.csv"); if path_exists(&fref_file) { pathlist.push(fref_file.clone()); - r.21 = read_to_string(&fref_file).unwrap(); + r.f21 = read_to_string(&fref_file).unwrap(); } - // Read the binary matrix file if appropriate. - - if bin_file_state == 2 { - read_from_file(&mut r.3, &bin_file); - let (n, k) = (r.3.nrows(), r.3.ncols()); - for i in 0..n { - r.2.push(r.3.row_label(i)); - } - for j in 0..k { - r.1.push(r.3.col_label(j)); - } - - // Otherwise we have to get stuff from the h5 file. - } else { - let mut matrix = Vec::>::new(); - let s = slurp_h5( - &h5_path, - bin_file_state == 3, - &mut r.2, - &mut r.1, - &mut matrix, - ); - if let Err(err) = s { - r.11 = err; - return; - } - if bin_file_state == 3 { - r.3 = MirrorSparseMatrix::build_from_vec(&matrix, &r.2, &r.1); - write_to_file(&r.3, &bin_file); - // Note that if the dataset archive was complete, we would not need to do this. - if ctl.gen_opt.internal_run { - let earth = &ctl.gen_opt.config["earth"]; - if !bin_file.starts_with(earth) { - let bin_file_alt = - format!("{earth}/current{}", bin_file.after("current")); - write_to_file(&r.3, &bin_file_alt); - } - } - } + // Read the feature barcode matrix file. + if let Err(err) = slurp_h5( + &h5_path, + &mut r.f2, + &mut r.f1, + ) { + r.f11 = err; + return; } } - unique_sort(&mut r.6); + unique_sort(&mut r.f6); }); - for r in &results { - ctl.pathlist.extend(r.15.iter().cloned()); + for (_, r) in &results { + ctl.pathlist.extend(r.f15.iter().cloned()); } ctl.perf_stats(&t, "in load_gex main loop"); // Test for error. let t = Instant::now(); - for r in &results { - if !r.11.is_empty() { - return Err(r.11.clone()); + for (_, r) in &results { + if !r.f11.is_empty() { + return Err(r.f11.clone()); } } // Set have_gex and have_fb. - for r in &results { - if r.4.is_some() { + for (_, r) in &results { + if r.f4.is_some() { *have_gex = true; } - if r.5.is_some() { + if r.f5.is_some() { *have_fb = true; } } - h5_paths.extend(results.iter().map(|r| r.12.clone())); + h5_paths.extend(results.iter().map(|(_, r)| r.f12.clone())); // Add some metrics. @@ -835,90 +699,54 @@ pub fn load_gex( let metric_name = x.0.to_string(); let metric_display_name = x.1.to_string(); let mut have = false; - for result in &results { - if result.17.contains_key(&metric_name) { + for (_, result) in &results { + if result.f17.contains_key(&metric_name) { have = true; } } if have { - for result in results.iter_mut() { + for (_, result) in results.iter_mut() { let mut value = String::new(); - if result.17.contains_key(&metric_name) { - value = format!("{:.3}", result.17[&metric_name]); + if result.f17.contains_key(&metric_name) { + value = format!("{:.3}", result.f17[&metric_name]); } - writeln!(result.18, "{metric_display_name},{value}").unwrap(); + writeln!(result.f18, "{metric_display_name},{value}").unwrap(); } } } - // Save results. This avoids cloning, which saves a lot of time. - - let n = results.len(); - for ( - _i, - ( - _x0, - x1, - x2, - x3, - x4, - x5, - x6, - x7, - x8, - x9, - x10, - _x11, - _x12, - x13, - x14, - _x15, - x16, - x17, - x18, - x19, - x20, - x21, - _x22, - x23, - x24, - x25, - x26, - x27, - ), - ) in results.into_iter().take(n).enumerate() - { - gex_features.push(x1); - gex_barcodes.push(x2); - gex_matrices.push(x3); - fb_top_matrices.push(x13); - fb_top_barcodes.push(x14); + for (_, r) in results.into_iter() { + gex_features.push(r.f1); + gex_barcodes.push(r.f2); + gex_matrices.push(r.f3); + fb_top_matrices.push(r.f13); + fb_top_barcodes.push(r.f14); let mut gex_mult = 1.0; - if let Some(x4) = x4 { - gex_mult = x4; + if let Some(f4) = r.f4 { + gex_mult = f4; } gex_mults.push(gex_mult); let mut fb_mult = 1.0; - if let Some(x5) = x5 { - fb_mult = x5; + if let Some(f5) = r.f5 { + fb_mult = f5; } fb_mults.push(fb_mult); - gex_cell_barcodes.push(x6); - cluster.push(x7); - cell_type.push(x8); - pca.push(x9); - cell_type_specified.push(x10); - feature_metrics.push(x16); - json_metrics.push(x17); - metrics.push(x18); - fb_total_umis.push(x19); - fb_brn.push(x20); - feature_refs.push(x21); - fb_brnr.push(x23); - fb_top_reads_matrices.push(x24); - fb_top_reads_barcodes.push(x25); - fb_total_reads.push(x26); - fb_bdcs.push(x27); + gex_cell_barcodes.push(r.f6); + cluster.push(r.f7); + cell_type.push(r.f8); + pca.push(r.f9); + cell_type_specified.push(r.f10); + feature_metrics.push(r.f16); + json_metrics.push(r.f17); + metrics.push(r.f18); + fb_total_umis.push(r.f19); + fb_brn.push(r.f20); + feature_refs.push(r.f21); + fb_brnr.push(r.f23); + fb_top_reads_matrices.push(r.f24); + fb_top_reads_barcodes.push(r.f25); + fb_total_reads.push(r.f26); + fb_bdcs.push(r.f27); } // Done. diff --git a/enclone_args/src/process_special_arg1.rs b/enclone_args/src/process_special_arg1.rs index a4e9eba98..c7cf2ce8f 100644 --- a/enclone_args/src/process_special_arg1.rs +++ b/enclone_args/src/process_special_arg1.rs @@ -39,10 +39,6 @@ pub fn process_special_arg1( return Err("\nCurrently the only allowed value for PG_DIST is MFL.\n".to_string()); } ctl.gen_opt.peer_group_dist = dist.to_string(); - } else if is_simple_arg(arg, "H5")? { - ctl.gen_opt.force_h5 = true; - } else if is_simple_arg(arg, "NH5")? { - ctl.gen_opt.force_h5 = false; } else if arg == "LEGEND" { ctl.plot_opt.use_legend = true; } else if arg == "MAX_HEAVIES=1" { diff --git a/enclone_core/src/defs.rs b/enclone_core/src/defs.rs index 1ab19cd0a..7e94ae0a5 100644 --- a/enclone_core/src/defs.rs +++ b/enclone_core/src/defs.rs @@ -172,7 +172,6 @@ pub struct GeneralOpt { pub accept_inconsistent: bool, // TEMPORARY! pub current_ref: bool, // TEMPORARY! pub internal_run: bool, - pub force_h5: bool, pub full_counts: bool, pub html: bool, pub html_title: String, diff --git a/enclone_core/src/slurp.rs b/enclone_core/src/slurp.rs index 2673bd257..14ce6abfb 100644 --- a/enclone_core/src/slurp.rs +++ b/enclone_core/src/slurp.rs @@ -7,10 +7,8 @@ use itertools::Itertools; pub fn slurp_h5( h5_path: &str, - take_matrix: bool, barcodes: &mut Vec, features: &mut Vec, - matrix: &mut Vec>, ) -> Result<(), String> { // Read barcodes from the h5 file. @@ -42,22 +40,5 @@ pub fn slurp_h5( feature_ids[i], feature_names[i], feature_types[i] )); } - - // If appropriate, construct the binary matrix file from the h5 file. - - if take_matrix { - let data_loc = h.dataset("matrix/data").unwrap(); - let data: Vec = data_loc.as_reader().read_raw().unwrap(); - let ind_loc = h.dataset("matrix/indices").unwrap(); - let ind: Vec = ind_loc.as_reader().read_raw().unwrap(); - let ind_ptr_loc = h.dataset("matrix/indptr").unwrap(); - let ind_ptr: Vec = ind_ptr_loc.as_reader().read_raw().unwrap(); - matrix.resize(barcodes.len(), Vec::new()); - for i in 0..matrix.len() { - for j in ind_ptr[i]..ind_ptr[i + 1] { - matrix[i].push((ind[j as usize] as i32, data[j as usize] as i32)); - } - } - } Ok(()) } diff --git a/mirror_sparse_matrix/src/lib.rs b/mirror_sparse_matrix/src/lib.rs index 03c470c5c..a9412ecb5 100644 --- a/mirror_sparse_matrix/src/lib.rs +++ b/mirror_sparse_matrix/src/lib.rs @@ -55,7 +55,6 @@ // All functions after item 4 above will assert strangely or return garbage. It would be // better to first call get_code_version_from_file. -use binary_vec_io::{binary_read_to_ref, binary_read_vec, binary_write_vec}; use std::cmp::max; #[derive(Clone)] @@ -63,38 +62,6 @@ pub struct MirrorSparseMatrix { x: Vec, } -pub fn get_code_version_from_file(f: &str) -> u32 { - assert_eq!(std::mem::size_of::(), 8); // for the usize at the beginning of the file - let mut ff = std::fs::File::open(f).unwrap(); - let mut x = [0_u32; 11]; - binary_read_to_ref::(&mut ff, &mut x[0], 11).unwrap(); - x[10] -} - -pub fn read_from_file(s: &mut MirrorSparseMatrix, f: &str) { - let mut ff = std::fs::File::open(f).unwrap(); - binary_read_vec::(&mut ff, &mut s.x).unwrap(); - if s.code_version() != 0 && s.code_version() != 1 { - panic!( - "\nMirrorSparseMatrix: code_version has to be 0 or 1, but it is {}.\n", - s.code_version() - ); - } - if s.storage_version() != 0 && s.storage_version() != 1 { - panic!( - "\nMirrorSparseMatrix: storage_version has to be 0 or 1, but it is {}.\n", - s.storage_version() - ); - } -} - -pub fn write_to_file(s: &MirrorSparseMatrix, f: &str) { - assert!(s.code_version() > 0); - let mut ff = - std::fs::File::create(f).unwrap_or_else(|_| panic!("Failed to create file {}.", f)); - binary_write_vec::(&mut ff, &s.x).unwrap(); -} - fn get_u8_at_pos(v: &[u8], pos: usize) -> u8 { v[pos] } From b5ca09453e5ffe87fa0971733e668d46ded359f5 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 28 Feb 2024 16:30:33 -0800 Subject: [PATCH 2/7] Keep pulling on the dead thread. --- enclone_args/src/load_gex.rs | 10 - enclone_args/src/load_gex_core.rs | 189 ++++++++--------- enclone_args/src/proc_args_check.rs | 9 +- enclone_core/src/defs.rs | 3 - enclone_print/src/print_utils1.rs | 13 +- enclone_print/src/print_utils2.rs | 65 +++--- enclone_print/src/print_utils4.rs | 73 ++----- enclone_print/src/proc_lvar_auto.rs | 314 +++++++++------------------- 8 files changed, 238 insertions(+), 438 deletions(-) diff --git a/enclone_args/src/load_gex.rs b/enclone_args/src/load_gex.rs index d66f51022..ab5596d68 100644 --- a/enclone_args/src/load_gex.rs +++ b/enclone_args/src/load_gex.rs @@ -20,11 +20,8 @@ use vector_utils::{bin_position, unique_sort}; pub fn get_gex_info(ctl: &mut EncloneControl) -> Result { let mut gex_features = Vec::>::new(); let mut gex_barcodes = Vec::>::new(); - let mut gex_matrices = Vec::::new(); let mut fb_top_barcodes = Vec::>::new(); - let mut fb_top_matrices = Vec::::new(); let mut fb_top_reads_barcodes = Vec::>::new(); - let mut fb_top_reads_matrices = Vec::::new(); let mut fb_total_umis = Vec::::new(); let mut fb_total_reads = Vec::::new(); let mut fb_brn = Vec::>::new(); @@ -48,11 +45,7 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result { ctl, &mut gex_features, &mut gex_barcodes, - &mut gex_matrices, - &mut fb_top_barcodes, - &mut fb_top_matrices, &mut fb_top_reads_barcodes, - &mut fb_top_reads_matrices, &mut fb_total_umis, &mut fb_total_reads, &mut fb_brn, @@ -163,11 +156,8 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result { Ok(GexInfo { gex_features, gex_barcodes, - gex_matrices, fb_top_barcodes, - fb_top_matrices, fb_top_reads_barcodes, - fb_top_reads_matrices, fb_total_umis, fb_total_reads, fb_brn, diff --git a/enclone_args/src/load_gex_core.rs b/enclone_args/src/load_gex_core.rs index 9abd473a2..8fbb9a48c 100644 --- a/enclone_args/src/load_gex_core.rs +++ b/enclone_args/src/load_gex_core.rs @@ -11,7 +11,6 @@ use enclone_core::defs::EncloneControl; use enclone_core::slurp::slurp_h5; use io_utils::{dir_list, open_for_read, open_userfile_for_read, path_exists}; use itertools::Itertools; -use mirror_sparse_matrix::MirrorSparseMatrix; use rayon::prelude::*; use serde_json::Value; use std::{ @@ -27,43 +26,34 @@ use vector_utils::{unique_sort, VecUtils}; #[derive(Default)] struct LoadResult { - f1: Vec, - f2: Vec, - f3: MirrorSparseMatrix, - f4: Option, - f5: Option, - f6: Vec, - f7: HashMap, - f8: HashMap, - f9: HashMap>, - f10: bool, - f11: String, - f12: String, - f13: MirrorSparseMatrix, - f14: Vec, + gex_features: Vec, + gex_barcodes: Vec, + gex_mult: Option, + fb_mult: Option, + gex_cell_barcodes: Vec, + cluster: HashMap, + cell_type: HashMap, + pca: HashMap>, + cell_type_specified: bool, + error: String, + h5_path: String, f15: Vec, - f16: HashMap<(String, String), String>, - f17: HashMap, - f18: String, - f19: u64, - f20: Vec<(String, u32, u32)>, - f21: String, - f23: Vec<(String, u32, u32)>, - f24: MirrorSparseMatrix, - f25: Vec, - f26: u64, - f27: Vec<(String, u32, u32, u32)>, + feature_metrics: HashMap<(String, String), String>, + json_metrics: HashMap, + metrics: String, + fb_total_umis: u64, + fb_brn: Vec<(String, u32, u32)>, + feature_refs: String, + fb_brnr: Vec<(String, u32, u32)>, + fb_total_reads: u64, + fb_bdcs: Vec<(String, u32, u32, u32)>, } pub fn load_gex( ctl: &mut EncloneControl, gex_features: &mut Vec>, gex_barcodes: &mut Vec>, - gex_matrices: &mut Vec, - fb_top_barcodes: &mut Vec>, - fb_top_matrices: &mut Vec, fb_top_reads_barcodes: &mut Vec>, - fb_top_reads_matrices: &mut Vec, fb_total_umis: &mut Vec, fb_total_reads: &mut Vec, fb_brn: &mut Vec>, @@ -112,7 +102,7 @@ pub fn load_gex( } else if root.ends_with("/outs") { outs = root.before("/outs").to_string(); if !path_exists(&outs) { - r.f11 = format!( + r.error = format!( "\nThe directory\n{outs}\ndoes not exist. Something must be amiss with \ the arguments to PRE and/or GEX and/or META.\n" ); @@ -137,14 +127,14 @@ pub fn load_gex( } } if h5_path.is_empty() { - r.f11 = format!( + r.error = format!( "\nThe file raw_feature_bc_matrix.h5 is not in the directory\n{outs}\n\ and neither is the older-named version raw_gene_bc_matrices_h5.h5. Perhaps \ something\nis amiss with the arguments to PRE and/or GEX and/or META.\n" ); return; } - r.f12 = h5_path.clone(); + r.h5_path = h5_path.clone(); let types_file = format!("{outs}/analysis_csv/celltypes/celltypes.csv"); // Define possible places for the analysis directory. @@ -180,7 +170,7 @@ pub fn load_gex( for f in [pca_file.clone(), cluster_file.clone()].iter() { if !path_exists(f) { - r.f11 = format!( + r.error = format!( "\nThe file\n{f}\ndoes not exist. \ Perhaps one of your directories is missing some stuff.\n\n\ One possibility is that you ran \"cellranger count\" using only \ @@ -223,7 +213,7 @@ pub fn load_gex( } } if csv.is_empty() { - r.f11 = format!( + r.error = format!( "\nSomething wrong with GEX or META argument:\ncan't find the file \ metrics_summary.csv or metrics_summary_csv.csv in the directory\n\ {outs}" @@ -245,14 +235,14 @@ pub fn load_gex( let s = line.unwrap(); let barcode = s.before(","); let cell_type = s.after(","); - r.f8.insert(barcode.to_string(), cell_type.to_string()); - r.f10 = true; + r.cell_type.insert(barcode.to_string(), cell_type.to_string()); + r.cell_type_specified = true; } } else if ctl.gen_opt.mark_stats || ctl.gen_opt.mark_stats2 || ctl.clono_filt_opt_def.marked_b { - r.f11 = format!( + r.error = format!( "\nIf you use MARK_STATS or MARK_STATS2 or MARKED_B, celltypes.csv has to \ exist, and this file\n{types_file}\ndoes not exist.\n" ); @@ -271,7 +261,7 @@ pub fn load_gex( for (var, value) in z.iter() { if value.as_f64().is_some() { let value = value.as_f64().unwrap(); - r.f17.insert(var.to_string(), value); + r.json_metrics.insert(var.to_string(), value); } } } @@ -305,7 +295,7 @@ pub fn load_gex( )); } } - r.f18 = format!("{}\n", lines.iter().format("\n")); + r.metrics = format!("{}\n", lines.iter().format("\n")); } // Read feature metrics file. Note that we do not enforce the requirement of this @@ -349,7 +339,7 @@ pub fn load_gex( || xfields[j] == "num_umis_cells" || xfields[j] == "num_reads_cells" { - r.f16.insert( + r.feature_metrics.insert( (feature.clone(), xfields[j].clone()), fields[j].clone(), ); @@ -374,7 +364,7 @@ pub fn load_gex( let y = s.after(",").split(',').map(str::force_f64).collect(); // This assert is turned off because in fact there are not always 10 components. // assert_eq!(x.len(), 10); - r.f9.insert(barcode.to_string(), y); + r.pca.insert(barcode.to_string(), y); } // Read graph clusters, and also get the cell barcodes from that. @@ -388,8 +378,8 @@ pub fn load_gex( } let s = line.unwrap(); let (barcode, cluster) = (s.before(","), s.after(",").force_usize()); - r.f7.insert(barcode.to_string(), cluster); - r.f6.push(barcode.to_string()); + r.cluster.insert(barcode.to_string(), cluster); + r.gex_cell_barcodes.push(barcode.to_string()); } // Get the multipliers gene and feature barcode counts. @@ -405,7 +395,7 @@ pub fn load_gex( } } if lines.is_empty() { - r.f11 = format!("\nThe file\n{csv}\nis empty.\n"); + r.error = format!("\nThe file\n{csv}\nis empty.\n"); return; } let fields = parse_csv(&lines[0]); @@ -431,7 +421,7 @@ pub fn load_gex( || fields.len() < name_field + 1 || fields.len() < value_field + 1 { - r.f11 = format!( + r.error = format!( "\nSomething appears to be wrong with the file\n{}:\n\ line {} doesn't have enough fields.\n", csv, @@ -446,7 +436,7 @@ pub fn load_gex( rpcx = rpcx.replace(',', ""); rpcx = rpcx.replace('\"', ""); if rpcx.parse::().is_err() { - r.f11 = format!( + r.error = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the Gene Expression Mean Reads per Cell value isn't an integer.\n" ); @@ -462,7 +452,7 @@ pub fn load_gex( fbrpcx = fbrpcx.replace(',', ""); fbrpcx = fbrpcx.replace('\"', ""); if fbrpcx.parse::().is_err() { - r.f11 = format!( + r.error = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the Antibody/Antigen Capture Mean Reads per Cell value isn't an integer.\n" ); @@ -472,7 +462,7 @@ pub fn load_gex( } } if rpc.is_none() && fbrpc.is_none() { - r.f11 = format!( + r.error = format!( "\nGene expression or feature barcode data was expected, however the \ CSV file\n{csv}\n\ does not have values for Gene Expression Mean Reads per Cell or @@ -496,7 +486,7 @@ pub fn load_gex( } } else if line_no == 1 { if rpc_field.is_some() && rpc_field.unwrap() >= fields.len() { - r.f11 = format!( + r.error = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the second line doesn't have enough fields.\n" ); @@ -506,7 +496,7 @@ pub fn load_gex( rpcx = rpcx.replace(',', ""); rpcx = rpcx.replace('\"', ""); if rpcx.parse::().is_err() { - r.f11 = format!( + r.error = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the Mean Reads per Cell field isn't an integer.\n" ); @@ -515,7 +505,7 @@ pub fn load_gex( rpc = Some(rpcx.force_usize() as isize); } if fbrpc_field.is_some() && fbrpc_field.unwrap() >= fields.len() { - r.f11 = format!( + r.error = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the second line doesn't have enough fields.\n" ); @@ -525,7 +515,7 @@ pub fn load_gex( fbrpcx = fbrpcx.replace(',', ""); fbrpcx = fbrpcx.replace('\"', ""); if fbrpcx.parse::().is_err() { - r.f11 = format!( + r.error = format!( "\nSomething appears to be wrong with the file\n{csv}:\n\ the Antibody/Antigen: Mean Reads per Cell field isn't an integer.\n" ); @@ -536,7 +526,7 @@ pub fn load_gex( } } if rpc.is_none() && fbrpc.is_none() { - r.f11 = format!( + r.error = format!( "\nGene expression or feature barcode data was expected, however the \ CSV file\n{csv}\n\ does not have a field \"Mean Reads per Cell\" or \ @@ -556,8 +546,8 @@ pub fn load_gex( const FB_RPC_EXPECTED: f64 = 5_000.0; fb_mult = Some(FB_RPC_EXPECTED / fbrpc as f64); } - r.f4 = gene_mult; - r.f5 = fb_mult; + r.gex_mult = gene_mult; + r.fb_mult = fb_mult; // Read the total UMIs. @@ -567,7 +557,7 @@ pub fn load_gex( let mut f = open_for_read![&top_file]; let mut bytes = Vec::::new(); f.read_to_end(&mut bytes).unwrap(); - r.f19 = u64::from_ne_bytes(bytes.try_into().unwrap()); + r.fb_total_umis = u64::from_ne_bytes(bytes.try_into().unwrap()); } // Read the total reads. @@ -578,7 +568,7 @@ pub fn load_gex( let mut f = open_for_read![&top_file]; let mut bytes = Vec::::new(); f.read_to_end(&mut bytes).unwrap(); - r.f26 = u64::from_ne_bytes(bytes.try_into().unwrap()); + r.fb_total_reads = u64::from_ne_bytes(bytes.try_into().unwrap()); } // Read the barcode-ref-nonref UMI count file. @@ -590,7 +580,7 @@ pub fn load_gex( for line in f.lines() { let s = line.unwrap(); let fields = parse_csv(&s); - r.f20.push(( + r.fb_brn.push(( fields[0].to_string(), fields[1].parse::().unwrap(), fields[2].parse::().unwrap(), @@ -607,7 +597,7 @@ pub fn load_gex( for line in f.lines() { let s = line.unwrap(); let fields = parse_csv(&s); - r.f23.push(( + r.fb_brnr.push(( fields[0].to_string(), fields[1].parse::().unwrap(), fields[2].parse::().unwrap(), @@ -624,7 +614,7 @@ pub fn load_gex( for line in f.lines() { let s = line.unwrap(); let fields = parse_csv(&s); - r.f27.push(( + r.fb_bdcs.push(( fields[0].to_string(), fields[1].parse::().unwrap(), fields[2].parse::().unwrap(), @@ -638,20 +628,20 @@ pub fn load_gex( let fref_file = fnx(&outs, "feature_reference.csv"); if path_exists(&fref_file) { pathlist.push(fref_file.clone()); - r.f21 = read_to_string(&fref_file).unwrap(); + r.feature_refs = read_to_string(&fref_file).unwrap(); } // Read the feature barcode matrix file. if let Err(err) = slurp_h5( &h5_path, - &mut r.f2, - &mut r.f1, + &mut r.gex_barcodes, + &mut r.gex_features, ) { - r.f11 = err; + r.error = err; return; } } - unique_sort(&mut r.f6); + unique_sort(&mut r.gex_cell_barcodes); }); for (_, r) in &results { ctl.pathlist.extend(r.f15.iter().cloned()); @@ -662,22 +652,22 @@ pub fn load_gex( let t = Instant::now(); for (_, r) in &results { - if !r.f11.is_empty() { - return Err(r.f11.clone()); + if !r.error.is_empty() { + return Err(r.error.clone()); } } // Set have_gex and have_fb. for (_, r) in &results { - if r.f4.is_some() { + if r.gex_mult.is_some() { *have_gex = true; } - if r.f5.is_some() { + if r.fb_mult.is_some() { *have_fb = true; } } - h5_paths.extend(results.iter().map(|(_, r)| r.f12.clone())); + h5_paths.extend(results.iter().map(|(_, r)| r.h5_path.clone())); // Add some metrics. @@ -700,53 +690,40 @@ pub fn load_gex( let metric_display_name = x.1.to_string(); let mut have = false; for (_, result) in &results { - if result.f17.contains_key(&metric_name) { + if result.json_metrics.contains_key(&metric_name) { have = true; } } if have { for (_, result) in results.iter_mut() { let mut value = String::new(); - if result.f17.contains_key(&metric_name) { - value = format!("{:.3}", result.f17[&metric_name]); + if result.json_metrics.contains_key(&metric_name) { + value = format!("{:.3}", result.json_metrics[&metric_name]); } - writeln!(result.f18, "{metric_display_name},{value}").unwrap(); + writeln!(result.metrics, "{metric_display_name},{value}").unwrap(); } } } for (_, r) in results.into_iter() { - gex_features.push(r.f1); - gex_barcodes.push(r.f2); - gex_matrices.push(r.f3); - fb_top_matrices.push(r.f13); - fb_top_barcodes.push(r.f14); - let mut gex_mult = 1.0; - if let Some(f4) = r.f4 { - gex_mult = f4; - } - gex_mults.push(gex_mult); - let mut fb_mult = 1.0; - if let Some(f5) = r.f5 { - fb_mult = f5; - } - fb_mults.push(fb_mult); - gex_cell_barcodes.push(r.f6); - cluster.push(r.f7); - cell_type.push(r.f8); - pca.push(r.f9); - cell_type_specified.push(r.f10); - feature_metrics.push(r.f16); - json_metrics.push(r.f17); - metrics.push(r.f18); - fb_total_umis.push(r.f19); - fb_brn.push(r.f20); - feature_refs.push(r.f21); - fb_brnr.push(r.f23); - fb_top_reads_matrices.push(r.f24); - fb_top_reads_barcodes.push(r.f25); - fb_total_reads.push(r.f26); - fb_bdcs.push(r.f27); + gex_features.push(r.gex_features); + gex_barcodes.push(r.gex_barcodes); + gex_mults.push(r.gex_mult.unwrap_or(1.0)); + fb_mults.push(r.fb_mult.unwrap_or(1.0)); + gex_cell_barcodes.push(r.gex_cell_barcodes); + cluster.push(r.cluster); + cell_type.push(r.cell_type); + pca.push(r.pca); + cell_type_specified.push(r.cell_type_specified); + feature_metrics.push(r.feature_metrics); + json_metrics.push(r.json_metrics); + metrics.push(r.metrics); + fb_total_umis.push(r.fb_total_umis); + fb_brn.push(r.fb_brn); + feature_refs.push(r.feature_refs); + fb_brnr.push(r.fb_brnr); + fb_total_reads.push(r.fb_total_reads); + fb_bdcs.push(r.fb_bdcs); } // Done. diff --git a/enclone_args/src/proc_args_check.rs b/enclone_args/src/proc_args_check.rs index f18b319fb..9f8465e24 100644 --- a/enclone_args/src/proc_args_check.rs +++ b/enclone_args/src/proc_args_check.rs @@ -637,14 +637,7 @@ pub fn check_one_lvar( .to_string(), ); } - if !gex_info.fb_top_matrices[0].initialized() { - return Err( - "\nThe variables fb and fb_n can only be used if the file \ - feature_barcode_matrix_top.bin was generated.\n" - .to_string(), - ); - } - return Ok(true); + return Err("\nThe variables fb and fb_n can no longer be used.\n".to_string()); } } diff --git a/enclone_core/src/defs.rs b/enclone_core/src/defs.rs index 7e94ae0a5..7ce181364 100644 --- a/enclone_core/src/defs.rs +++ b/enclone_core/src/defs.rs @@ -829,10 +829,7 @@ pub struct CloneInfo { pub struct GexInfo { pub gex_features: Vec>, pub gex_barcodes: Vec>, - pub gex_matrices: Vec, - pub fb_top_matrices: Vec, pub fb_top_barcodes: Vec>, - pub fb_top_reads_matrices: Vec, pub fb_top_reads_barcodes: Vec>, pub fb_total_umis: Vec, pub fb_total_reads: Vec, diff --git a/enclone_print/src/print_utils1.rs b/enclone_print/src/print_utils1.rs index ffc90a84b..57accb288 100644 --- a/enclone_print/src/print_utils1.rs +++ b/enclone_print/src/print_utils1.rs @@ -861,16 +861,13 @@ pub fn get_gex_matrix_entry( y: &str, ) -> f64 { let mut raw_count = 0 as f64; - if gex_info.gex_matrices[li].initialized() { - raw_count = gex_info.gex_matrices[li].value(p, fid) as f64; - } else { - for (&da, &ia) in d_all[l].iter().zip(ind_all[l].iter()) { - if ia == fid as u32 { - raw_count = da as f64; - break; - } + for (&da, &ia) in d_all[l].iter().zip(ind_all[l].iter()) { + if ia == fid as u32 { + raw_count = da as f64; + break; } } + let mult = if y.ends_with("_g") { gex_info.gex_mults[li] } else { diff --git a/enclone_print/src/print_utils2.rs b/enclone_print/src/print_utils2.rs index 6dd0960c4..1541c3fdc 100644 --- a/enclone_print/src/print_utils2.rs +++ b/enclone_print/src/print_utils2.rs @@ -216,46 +216,37 @@ pub fn row_fill( let p = bin_position(&gex_info.gex_barcodes[li], &bc); if p >= 0 { let mut raw_count = 0; - if gex_info.gex_matrices[li].initialized() { - let row = gex_info.gex_matrices[li].row(p as usize); - for r in row { - let f = r.0; - let n = r.1; - if gex_info.is_gex[li][f] { - raw_count += n; - } - } + + let z1 = gex_info.h5_indptr[li][p as usize] as usize; + let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? + let d: Vec; + let ind: Vec; + if ctl.gen_opt.h5_pre { + d = h5_data[li].1[z1..z2].to_vec(); + ind = h5_data[li].2[z1..z2].to_vec(); } else { - let z1 = gex_info.h5_indptr[li][p as usize] as usize; - let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? - let d: Vec; - let ind: Vec; - if ctl.gen_opt.h5_pre { - d = h5_data[li].1[z1..z2].to_vec(); - ind = h5_data[li].2[z1..z2].to_vec(); - } else { - d = d_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - ind = ind_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - } - for j in 0..d.len() { - if gex_info.is_gex[li][ind[j] as usize] { - let n = d[j] as usize; - raw_count += n; - } + d = d_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + ind = ind_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + } + for j in 0..d.len() { + if gex_info.is_gex[li][ind[j] as usize] { + let n = d[j] as usize; + raw_count += n; } - d_all[l] = d; - ind_all[l] = ind; } + d_all[l] = d; + ind_all[l] = ind; + if !ctl.gen_opt.full_counts { count = (raw_count as f64 * gex_info.gex_mults[li]).round() as usize; fcount = raw_count as f64 * gex_info.gex_mults[li]; diff --git a/enclone_print/src/print_utils4.rs b/enclone_print/src/print_utils4.rs index d1aece781..9e15821a9 100644 --- a/enclone_print/src/print_utils4.rs +++ b/enclone_print/src/print_utils4.rs @@ -27,16 +27,14 @@ pub fn get_gex_matrix_entry( y: &str, ) -> f64 { let mut raw_count = 0 as f64; - if gex_info.gex_matrices[li].initialized() { - raw_count = gex_info.gex_matrices[li].value(p, fid) as f64; - } else { - for j in 0..d_all[l].len() { - if ind_all[l][j] == fid as u32 { - raw_count = d_all[l][j] as f64; - break; - } + + for j in 0..d_all[l].len() { + if ind_all[l][j] == fid as u32 { + raw_count = d_all[l][j] as f64; + break; } } + let mult = if y.ends_with("_g") { gex_info.gex_mults[li] } else { @@ -663,41 +661,24 @@ pub fn compute_bu( let p = bin_position(&gex_info.gex_barcodes[li], bc); if p >= 0 { let mut raw_count = 0; - if gex_info.gex_matrices[li].initialized() { - let row = gex_info.gex_matrices[li].row(p as usize); - for (f, n) in row { - if gex_info.is_gex[li][f] { - raw_count += n; - } - } - } else { - let l = bcl.2; - for j in 0..d_all[l].len() { - if gex_info.is_gex[li][ind_all[l][j] as usize] { - raw_count += d_all[l][j] as usize; - } + + let l = bcl.2; + for j in 0..d_all[l].len() { + if gex_info.is_gex[li][ind_all[l][j] as usize] { + raw_count += d_all[l][j] as usize; } } + gex_count = raw_count; } let mut entropy = 0.0; if p >= 0 { - if gex_info.gex_matrices[li].initialized() { - let row = gex_info.gex_matrices[li].row(p as usize); - for (f, n) in row { - if gex_info.is_gex[li][f] { - let q = n as f64 / gex_count as f64; - entropy -= q * q.log2(); - } - } - } else { - let l = bcl.2; - for j in 0..d_all[l].len() { - if gex_info.is_gex[li][ind_all[l][j] as usize] { - let n = d_all[l][j] as usize; - let q = n as f64 / gex_count as f64; - entropy -= q * q.log2(); - } + let l = bcl.2; + for j in 0..d_all[l].len() { + if gex_info.is_gex[li][ind_all[l][j] as usize] { + let n = d_all[l][j] as usize; + let q = n as f64 / gex_count as f64; + entropy -= q * q.log2(); } } } @@ -709,21 +690,13 @@ pub fn compute_bu( let p = bin_position(&gex_info.gex_barcodes[li], bc); if p >= 0 { let mut raw_count = 0 as f64; - if gex_info.gex_matrices[li].initialized() { - let row = gex_info.gex_matrices[li].row(p as usize); - for (f, n) in row { - if gex_info.is_gex[li][f] { - raw_count += n as f64; - } - } - } else { - let l = bcl.2; - for j in 0..d_all[l].len() { - if gex_info.is_gex[li][ind_all[l][j] as usize] { - raw_count += d_all[l][j] as f64; - } + let l = bcl.2; + for j in 0..d_all[l].len() { + if gex_info.is_gex[li][ind_all[l][j] as usize] { + raw_count += d_all[l][j] as f64; } } + if !ctl.gen_opt.full_counts { gex_count = raw_count * gex_info.gex_mults[li]; } else { diff --git a/enclone_print/src/proc_lvar_auto.rs b/enclone_print/src/proc_lvar_auto.rs index deb734757..f4edd177c 100644 --- a/enclone_print/src/proc_lvar_auto.rs +++ b/enclone_print/src/proc_lvar_auto.rs @@ -787,41 +787,33 @@ pub fn proc_lvar_auto( let p = bin_position(&gex_info.gex_barcodes[li], &bc); if p >= 0 { let mut raw_count = 0; - if gex_info.gex_matrices[li].initialized() { - let row = gex_info.gex_matrices[li].row(p as usize); - for (f, n) in row { - if gex_info.is_gex[li][f] { - raw_count += n; - } - } + let z1 = gex_info.h5_indptr[li][p as usize] as usize; + let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? + let d: Vec; + let ind: Vec; + if ctl.gen_opt.h5_pre { + d = h5_data[li].1[z1..z2].to_vec(); + ind = h5_data[li].2[z1..z2].to_vec(); } else { - let z1 = gex_info.h5_indptr[li][p as usize] as usize; - let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? - let d: Vec; - let ind: Vec; - if ctl.gen_opt.h5_pre { - d = h5_data[li].1[z1..z2].to_vec(); - ind = h5_data[li].2[z1..z2].to_vec(); - } else { - d = d_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - ind = ind_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - } - for j in 0..d.len() { - if gex_info.is_gex[li][ind[j] as usize] { - raw_count += d[j] as usize; - } + d = d_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + ind = ind_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + } + for j in 0..d.len() { + if gex_info.is_gex[li][ind[j] as usize] { + raw_count += d[j] as usize; } } + total_counts.push(raw_count); } } @@ -834,42 +826,32 @@ pub fn proc_lvar_auto( let mut entropy = 0.0; let p = bin_position(&gex_info.gex_barcodes[li], &bc.to_string()); if p >= 0 { - if gex_info.gex_matrices[li].initialized() { - let row = gex_info.gex_matrices[li].row(p as usize); - for (f, n) in row { - if gex_info.is_gex[li][f] { - let q = n as f64 / tc as f64; - entropy -= q * q.log2(); - } - } + let z1 = gex_info.h5_indptr[li][p as usize] as usize; + let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? + let d: Vec; + let ind: Vec; + if ctl.gen_opt.h5_pre { + d = h5_data[li].1[z1..z2].to_vec(); + ind = h5_data[li].2[z1..z2].to_vec(); } else { - let z1 = gex_info.h5_indptr[li][p as usize] as usize; - let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? - let d: Vec; - let ind: Vec; - if ctl.gen_opt.h5_pre { - d = h5_data[li].1[z1..z2].to_vec(); - ind = h5_data[li].2[z1..z2].to_vec(); - } else { - d = d_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - ind = ind_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - } - for j in 0..d.len() { - if gex_info.is_gex[li][ind[j] as usize] { - let n = d[j] as usize; - let q = n as f64 / tc as f64; - entropy -= q * q.log2(); - } + d = d_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + ind = ind_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + } + for j in 0..d.len() { + if gex_info.is_gex[li][ind[j] as usize] { + let n = d[j] as usize; + let q = n as f64 / tc as f64; + entropy -= q * q.log2(); } } } @@ -897,41 +879,33 @@ pub fn proc_lvar_auto( let p = bin_position(&gex_info.gex_barcodes[li], &bc); if p >= 0 { let mut raw_count = 0; - if gex_info.gex_matrices[li].initialized() { - let row = gex_info.gex_matrices[li].row(p as usize); - for (f, n) in row { - if gex_info.is_gex[li][f] { - raw_count += n; - } - } + let z1 = gex_info.h5_indptr[li][p as usize] as usize; + let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? + let d: Vec; + let ind: Vec; + if ctl.gen_opt.h5_pre { + d = h5_data[li].1[z1..z2].to_vec(); + ind = h5_data[li].2[z1..z2].to_vec(); } else { - let z1 = gex_info.h5_indptr[li][p as usize] as usize; - let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? - let d: Vec; - let ind: Vec; - if ctl.gen_opt.h5_pre { - d = h5_data[li].1[z1..z2].to_vec(); - ind = h5_data[li].2[z1..z2].to_vec(); - } else { - d = d_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - ind = ind_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - } - for j in 0..d.len() { - if gex_info.is_gex[li][ind[j] as usize] { - raw_count += d[j] as usize; - } + d = d_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + ind = ind_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + } + for j in 0..d.len() { + if gex_info.is_gex[li][ind[j] as usize] { + raw_count += d[j] as usize; } } + total_counts.push(raw_count); } } @@ -944,42 +918,32 @@ pub fn proc_lvar_auto( let mut entropy = 0.0; let p = bin_position(&gex_info.gex_barcodes[li], bc); if p >= 0 { - if gex_info.gex_matrices[li].initialized() { - let row = gex_info.gex_matrices[li].row(p as usize); - for (f, n) in row { - if gex_info.is_gex[li][f] { - let q = n as f64 / total_counts[l] as f64; - entropy -= q * q.log2(); - } - } + let z1 = gex_info.h5_indptr[li][p as usize] as usize; + let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? + let d: Vec; + let ind: Vec; + if ctl.gen_opt.h5_pre { + d = h5_data[li].1[z1..z2].to_vec(); + ind = h5_data[li].2[z1..z2].to_vec(); } else { - let z1 = gex_info.h5_indptr[li][p as usize] as usize; - let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // is p+1 OK?? - let d: Vec; - let ind: Vec; - if ctl.gen_opt.h5_pre { - d = h5_data[li].1[z1..z2].to_vec(); - ind = h5_data[li].2[z1..z2].to_vec(); - } else { - d = d_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - ind = ind_readers[li] - .as_ref() - .unwrap() - .read_slice(s![z1..z2]) - .unwrap() - .to_vec(); - } - for j in 0..d.len() { - if gex_info.is_gex[li][ind[j] as usize] { - let n = d[j] as usize; - let q = n as f64 / total_counts[l] as f64; - entropy -= q * q.log2(); - } + d = d_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + ind = ind_readers[li] + .as_ref() + .unwrap() + .read_slice(s![z1..z2]) + .unwrap() + .to_vec(); + } + for j in 0..d.len() { + if gex_info.is_gex[li][ind[j] as usize] { + let n = d[j] as usize; + let q = n as f64 / total_counts[l] as f64; + entropy -= q * q.log2(); } } } @@ -1022,88 +986,6 @@ pub fn proc_lvar_auto( }; (d, Vec::new(), "exact") - } else if vname.starts_with("fb") - && vname.ends_with("") - && vname.between2("fb", "").parse::().is_ok() - && vname.between2("fb", "").force_i64() >= 1 - { - let arg1 = vname.between2("fb", "").force_i64(); - let ncols = gex_info.fb_top_matrices[0].ncols(); - let n = (arg1 - 1) as usize; - let fb = if n < ncols { - gex_info.fb_top_matrices[0].col_label(n) - } else { - String::new() - }; - - ((*fb).to_string(), Vec::new(), "exact") - } else if vname.starts_with("fb") - && vname.ends_with("_n") - && vname.between2("fb", "_n").parse::().is_ok() - && vname.between2("fb", "_n").force_i64() >= 1 - { - let arg1 = vname.between2("fb", "_n").force_i64(); - let ncols = gex_info.fb_top_matrices[0].ncols(); - let n = (arg1 - 1) as usize; - let median; - let mut counts; - if n >= ncols { - median = 0; - counts = vec!["0".to_string(); ex.ncells()]; - } else { - counts = Vec::::new(); - let mut counts_sorted = Vec::::new(); - for l in 0..ex.clones.len() { - let bc = ex.clones[l][0].barcode.clone(); - let p = bin_position(&gex_info.fb_top_barcodes[0], &bc); - if p < 0 { - counts.push("0".to_string()); - counts_sorted.push(0); - } else { - let x = gex_info.fb_top_matrices[0].value(p as usize, n); - counts.push(format!("{x}")); - counts_sorted.push(x); - } - } - counts_sorted.sort_unstable(); - median = rounded_median(&counts_sorted); - } - - (format!("{median}"), counts, "cell-exact") - } else if vname.starts_with("fb") - && vname.ends_with("_n_cell") - && vname.between2("fb", "_n_cell").parse::().is_ok() - && vname.between2("fb", "_n_cell").force_i64() >= 1 - { - let arg1 = vname.between2("fb", "_n_cell").force_i64(); - let ncols = gex_info.fb_top_matrices[0].ncols(); - let n = (arg1 - 1) as usize; - let median; - let mut counts; - if n >= ncols { - median = 0; - counts = vec!["0".to_string(); ex.ncells()]; - } else { - counts = Vec::::new(); - let mut counts_sorted = Vec::::new(); - for l in 0..ex.clones.len() { - let bc = ex.clones[l][0].barcode.clone(); - let p = bin_position(&gex_info.fb_top_barcodes[0], &bc); - if p < 0 { - counts.push("0".to_string()); - counts_sorted.push(0); - } else { - let x = gex_info.fb_top_matrices[0].value(p as usize, n); - counts.push(format!("{x}")); - counts_sorted.push(x); - } - } - counts_sorted.sort_unstable(); - median = rounded_median(&counts_sorted); - } - - let _exact = format!("{median}"); - (String::new(), counts, "cell-exact") } else if vname == "filter" { let mut fates = Vec::::new(); for j in 0..ex.clones.len() { From ab4275c961e446d9cd095525d5bf3c7e83e1cf84 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 28 Feb 2024 16:31:38 -0800 Subject: [PATCH 3/7] Fix a couple of fatal clippy lints. --- enclone_print/src/print_clonotypes.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/enclone_print/src/print_clonotypes.rs b/enclone_print/src/print_clonotypes.rs index b21e3c8e8..4efa45eb1 100644 --- a/enclone_print/src/print_clonotypes.rs +++ b/enclone_print/src/print_clonotypes.rs @@ -906,7 +906,6 @@ pub fn print_clonotypes( } } }); - let exacts = exacts; for r in &results { if !r.13.is_empty() { return Err(r.13.clone()); From 369274bc62a2543abe27097c04bdcd26bece665c Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 28 Feb 2024 16:35:52 -0800 Subject: [PATCH 4/7] cargo check completes successfully. --- enclone_ranger/src/stop.rs | 7 ++----- enclone_stuff/src/fcell.rs | 10 +++------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/enclone_ranger/src/stop.rs b/enclone_ranger/src/stop.rs index 2a909990e..5dd88a6f1 100644 --- a/enclone_ranger/src/stop.rs +++ b/enclone_ranger/src/stop.rs @@ -31,7 +31,7 @@ pub fn main_enclone_stop_ranger(mut inter: EncloneIntermediates) -> Result<(), S let mut d_readers = Vec::>::new(); let mut ind_readers = Vec::>::new(); for li in 0..ctl.origin_info.n() { - if !ctl.origin_info.gex_path[li].is_empty() && !gex_info.gex_matrices[li].initialized() { + if !ctl.origin_info.gex_path[li].is_empty() { let x = gex_info.h5_data[li].as_ref(); d_readers.push(Some(x.unwrap().as_reader())); ind_readers.push(Some(gex_info.h5_indices[li].as_ref().unwrap().as_reader())); @@ -46,10 +46,7 @@ pub fn main_enclone_stop_ranger(mut inter: EncloneIntermediates) -> Result<(), S } h5_data.par_iter_mut().for_each(|res| { let li = res.0; - if !ctl.origin_info.gex_path[li].is_empty() - && !gex_info.gex_matrices[li].initialized() - && ctl.gen_opt.h5_pre - { + if !ctl.origin_info.gex_path[li].is_empty() && ctl.gen_opt.h5_pre { res.1 = d_readers[li].as_ref().unwrap().read_raw().unwrap(); res.2 = ind_readers[li].as_ref().unwrap().read_raw().unwrap(); } diff --git a/enclone_stuff/src/fcell.rs b/enclone_stuff/src/fcell.rs index 48a1a3e0c..b1f0d365c 100644 --- a/enclone_stuff/src/fcell.rs +++ b/enclone_stuff/src/fcell.rs @@ -30,8 +30,7 @@ pub fn filter_by_fcell( let mut d_readers = Vec::>::new(); let mut ind_readers = Vec::>::new(); for li in 0..ctl.origin_info.n() { - if !ctl.origin_info.gex_path[li].is_empty() && !gex_info.gex_matrices[li].initialized() - { + if !ctl.origin_info.gex_path[li].is_empty() { let x = gex_info.h5_data[li].as_ref(); if x.is_none() { // THIS FAILS SPORADICALLY, OBSERVED MULTIPLE TIMES, @@ -95,10 +94,7 @@ pub fn filter_by_fcell( } h5_data.par_iter_mut().for_each(|res| { let li = res.0; - if !ctl.origin_info.gex_path[li].is_empty() - && !gex_info.gex_matrices[li].initialized() - && ctl.gen_opt.h5_pre - { + if !ctl.origin_info.gex_path[li].is_empty() && ctl.gen_opt.h5_pre { res.1 = d_readers[li].as_ref().unwrap().read_raw().unwrap(); res.2 = ind_readers[li].as_ref().unwrap().read_raw().unwrap(); } @@ -122,7 +118,7 @@ pub fn filter_by_fcell( let bc = ex.clones[l][0].barcode.clone(); if !gex_info.gex_barcodes.is_empty() { let p = bin_position(&gex_info.gex_barcodes[li], &bc); - if p >= 0 && !gex_info.gex_matrices[li].initialized() { + if p >= 0 { let z1 = gex_info.h5_indptr[li][p as usize] as usize; let z2 = gex_info.h5_indptr[li][p as usize + 1] as usize; // p+1 OK? if ctl.gen_opt.h5_pre { From 22c8e196a0afadc9c67817c6a98992c9bed65a66 Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 28 Feb 2024 16:45:51 -0800 Subject: [PATCH 5/7] Burn down the rest of the unused warnings. --- Cargo.lock | 11 - Cargo.toml | 1 - enclone_args/Cargo.toml | 1 - enclone_args/src/load_gex.rs | 6 - enclone_args/src/load_gex_core.rs | 1 - enclone_core/Cargo.toml | 1 - enclone_core/src/defs.rs | 3 - enclone_print/src/print_utils1.rs | 32 +- enclone_print/src/print_utils4.rs | 5 +- enclone_print/src/proc_lvar2.rs | 8 +- enclone_stuff/src/fcell.rs | 3 +- enclone_vars/src/vars | 59 --- mirror_sparse_matrix/Cargo.toml | 15 - mirror_sparse_matrix/LICENSE | 21 -- mirror_sparse_matrix/README.md | 2 - mirror_sparse_matrix/src/lib.rs | 605 ------------------------------ 16 files changed, 7 insertions(+), 767 deletions(-) delete mode 100644 mirror_sparse_matrix/Cargo.toml delete mode 100644 mirror_sparse_matrix/LICENSE delete mode 100644 mirror_sparse_matrix/README.md delete mode 100644 mirror_sparse_matrix/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index e08a27bfe..20e23baa9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -469,7 +469,6 @@ dependencies = [ "hdf5", "io_utils", "itertools", - "mirror_sparse_matrix", "rand", "rayon", "regex", @@ -494,7 +493,6 @@ dependencies = [ "io_utils", "itertools", "lazy_static", - "mirror_sparse_matrix", "perf_stats", "qd", "rayon", @@ -1087,15 +1085,6 @@ dependencies = [ "adler", ] -[[package]] -name = "mirror_sparse_matrix" -version = "0.1.17" -dependencies = [ - "binary_vec_io", - "io_utils", - "pretty_trace", -] - [[package]] name = "multimap" version = "0.8.3" diff --git a/Cargo.toml b/Cargo.toml index 7178889cd..5207e6968 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,6 @@ members = [ "io_utils", "kmer_lookup", "load_feature_bc", - "mirror_sparse_matrix", "perf_stats", "pretty_trace", "stats_utils", diff --git a/enclone_args/Cargo.toml b/enclone_args/Cargo.toml index e718db55f..cc16c5d5c 100644 --- a/enclone_args/Cargo.toml +++ b/enclone_args/Cargo.toml @@ -30,7 +30,6 @@ evalexpr = ">=7, <12" expr_tools = { path = "../expr_tools" } io_utils = { path = "../io_utils" } itertools.workspace = true -mirror_sparse_matrix = { path = "../mirror_sparse_matrix" } rand = "0.8" rayon = "1" regex = { version = "1", default-features = false, features = ["std", "perf"] } diff --git a/enclone_args/src/load_gex.rs b/enclone_args/src/load_gex.rs index ab5596d68..b1ad8ef13 100644 --- a/enclone_args/src/load_gex.rs +++ b/enclone_args/src/load_gex.rs @@ -7,7 +7,6 @@ use crate::load_gex_core::load_gex; use enclone_core::defs::{EncloneControl, GexInfo}; use hdf5::Dataset; -use mirror_sparse_matrix::MirrorSparseMatrix; use rayon::prelude::*; use std::fmt::Write; use std::{collections::HashMap, time::Instant}; @@ -20,8 +19,6 @@ use vector_utils::{bin_position, unique_sort}; pub fn get_gex_info(ctl: &mut EncloneControl) -> Result { let mut gex_features = Vec::>::new(); let mut gex_barcodes = Vec::>::new(); - let mut fb_top_barcodes = Vec::>::new(); - let mut fb_top_reads_barcodes = Vec::>::new(); let mut fb_total_umis = Vec::::new(); let mut fb_total_reads = Vec::::new(); let mut fb_brn = Vec::>::new(); @@ -45,7 +42,6 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result { ctl, &mut gex_features, &mut gex_barcodes, - &mut fb_top_reads_barcodes, &mut fb_total_umis, &mut fb_total_reads, &mut fb_brn, @@ -156,8 +152,6 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result { Ok(GexInfo { gex_features, gex_barcodes, - fb_top_barcodes, - fb_top_reads_barcodes, fb_total_umis, fb_total_reads, fb_brn, diff --git a/enclone_args/src/load_gex_core.rs b/enclone_args/src/load_gex_core.rs index 8fbb9a48c..fca062e05 100644 --- a/enclone_args/src/load_gex_core.rs +++ b/enclone_args/src/load_gex_core.rs @@ -53,7 +53,6 @@ pub fn load_gex( ctl: &mut EncloneControl, gex_features: &mut Vec>, gex_barcodes: &mut Vec>, - fb_top_reads_barcodes: &mut Vec>, fb_total_umis: &mut Vec, fb_total_reads: &mut Vec, fb_brn: &mut Vec>, diff --git a/enclone_core/Cargo.toml b/enclone_core/Cargo.toml index ad3425e5a..e92329900 100644 --- a/enclone_core/Cargo.toml +++ b/enclone_core/Cargo.toml @@ -31,7 +31,6 @@ evalexpr = ">=7, <12" io_utils = { path = "../io_utils" } itertools.workspace = true lazy_static = "1" -mirror_sparse_matrix = { path = "../mirror_sparse_matrix" } perf_stats = { path = "../perf_stats" } qd = { git = "https://github.com/Barandis/qd" } rayon = "1" diff --git a/enclone_core/src/defs.rs b/enclone_core/src/defs.rs index 7ce181364..484a364f0 100644 --- a/enclone_core/src/defs.rs +++ b/enclone_core/src/defs.rs @@ -7,7 +7,6 @@ use evalexpr::Node; use hdf5::Dataset; use io_utils::{open_for_read, path_exists}; -use mirror_sparse_matrix::MirrorSparseMatrix; use perf_stats::elapsed; #[cfg(not(target_os = "windows"))] @@ -829,8 +828,6 @@ pub struct CloneInfo { pub struct GexInfo { pub gex_features: Vec>, pub gex_barcodes: Vec>, - pub fb_top_barcodes: Vec>, - pub fb_top_reads_barcodes: Vec>, pub fb_total_umis: Vec, pub fb_total_reads: Vec, pub fb_brn: Vec>, diff --git a/enclone_print/src/print_utils1.rs b/enclone_print/src/print_utils1.rs index 57accb288..bc3689f96 100644 --- a/enclone_print/src/print_utils1.rs +++ b/enclone_print/src/print_utils1.rs @@ -5,7 +5,7 @@ use ansi_escape::{ emit_bold_escape, emit_eight_bit_color_escape, emit_end_escape, emit_red_escape, }; use enclone_core::cell_color::CellColor; -use enclone_core::defs::{ColInfo, EncloneControl, ExactClonotype, GexInfo, TigData1, POUT_SEP}; +use enclone_core::defs::{ColInfo, EncloneControl, ExactClonotype, TigData1, POUT_SEP}; use enclone_core::print_tools::{color_by_property, emit_codon_color_escape}; use enclone_vars::decode_arith; use expr_tools::vars_of_node; @@ -849,36 +849,6 @@ pub fn cdr3_aa_con( c } -pub fn get_gex_matrix_entry( - ctl: &EncloneControl, - gex_info: &GexInfo, - fid: usize, - d_all: &[Vec], - ind_all: &[Vec], - li: usize, - l: usize, - p: usize, - y: &str, -) -> f64 { - let mut raw_count = 0 as f64; - for (&da, &ia) in d_all[l].iter().zip(ind_all[l].iter()) { - if ia == fid as u32 { - raw_count = da as f64; - break; - } - } - - let mult = if y.ends_with("_g") { - gex_info.gex_mults[li] - } else { - gex_info.fb_mults[li] - }; - if !ctl.gen_opt.full_counts { - raw_count *= mult; - } - raw_count -} - // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ pub fn extra_args(ctl: &EncloneControl) -> Vec { diff --git a/enclone_print/src/print_utils4.rs b/enclone_print/src/print_utils4.rs index 9e15821a9..04b310d67 100644 --- a/enclone_print/src/print_utils4.rs +++ b/enclone_print/src/print_utils4.rs @@ -23,7 +23,6 @@ pub fn get_gex_matrix_entry( ind_all: &[Vec], li: usize, l: usize, - p: usize, y: &str, ) -> f64 { let mut raw_count = 0 as f64; @@ -731,14 +730,14 @@ pub fn compute_bu( computed = true; for fid in ux.iter() { let counti = get_gex_matrix_entry( - ctl, gex_info, *fid, d_all, ind_all, li, l, p as usize, y, + ctl, gex_info, *fid, d_all, ind_all, li, l, y, ); count += counti; } } else if let Some(&fid) = gex_info.feature_id[li].get(&y.to_string()) { computed = true; count = get_gex_matrix_entry( - ctl, gex_info, fid, d_all, ind_all, li, l, p as usize, y, + ctl, gex_info, fid, d_all, ind_all, li, l, y, ); } } diff --git a/enclone_print/src/proc_lvar2.rs b/enclone_print/src/proc_lvar2.rs index 5a3510693..20c4ebc99 100644 --- a/enclone_print/src/proc_lvar2.rs +++ b/enclone_print/src/proc_lvar2.rs @@ -122,9 +122,8 @@ pub fn proc_lvar2( computed = true; let mut raw_count = 0.0; for fid in ux.iter() { - let raw_counti = get_gex_matrix_entry( - ctl, gex_info, *fid, d_all, ind_all, li, l, p as usize, y, - ); + let raw_counti = + get_gex_matrix_entry(ctl, gex_info, *fid, d_all, ind_all, li, l, y); raw_count += raw_counti; } counts_sub.push(raw_count.round() as usize); @@ -135,8 +134,7 @@ pub fn proc_lvar2( let p = bin_position(&gex_info.gex_barcodes[li], &bc); if p >= 0 { let fid = gex_info.feature_id[li][&y.to_string()]; - let raw_count = - get_gex_matrix_entry(ctl, gex_info, fid, d_all, ind_all, li, l, p as usize, y); + let raw_count = get_gex_matrix_entry(ctl, gex_info, fid, d_all, ind_all, li, l, y); counts_sub.push(raw_count.round() as usize); fcounts_sub.push(raw_count); } diff --git a/enclone_stuff/src/fcell.rs b/enclone_stuff/src/fcell.rs index b1f0d365c..be60e3142 100644 --- a/enclone_stuff/src/fcell.rs +++ b/enclone_stuff/src/fcell.rs @@ -172,8 +172,7 @@ pub fn filter_by_fcell( let p = bin_position(&gex_info.gex_barcodes[li], bc); if p >= 0 { let raw_count = get_gex_matrix_entry( - ctl, gex_info, fid, &d_all, &ind_all, li, l, - p as usize, var, + ctl, gex_info, fid, &d_all, &ind_all, li, l, var, ); val = format!("{raw_count:.2}"); } diff --git a/enclone_vars/src/vars b/enclone_vars/src/vars index 8ffb72078..5c7520e53 100644 --- a/enclone_vars/src/vars +++ b/enclone_vars/src/vars @@ -2046,65 +2046,6 @@ code: let mut dist = -1_isize; } exact: d ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -name: fb{1..} -inputs: lvar_vdj -limits: -class: lvar -level: exact -val: string -doc: TBD -brief: sequence of the nth most frequent feature barcode -page: enclone help lvars -avail: private -notes: -code: let ncols = gex_info.fb_top_matrices[0].ncols(); - let n = (arg1 - 1) as usize; - let fb = if n < ncols { - gex_info.fb_top_matrices[0].col_label(n) - } else { - String::new() - }; - exact: (*fb).to_string() -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -name: fb{1..}_n -inputs: lvar_vdj -limits: -class: lvar -level: cell-exact -val: nonnegative integer -doc: TBD -brief: number of UMIs for the nth most frequent feature barcode -page: enclone help lvars -avail: private -notes: -code: let ncols = gex_info.fb_top_matrices[0].ncols(); - let n = (arg1 - 1) as usize; - let median; - let mut counts; - if n >= ncols { - median = 0; - counts = vec!["0".to_string(); ex.ncells()]; - } else { - counts = Vec::::new(); - let mut counts_sorted = Vec::::new(); - for l in 0..ex.clones.len() { - let bc = ex.clones[l][0].barcode.clone(); - let p = bin_position(&gex_info.fb_top_barcodes[0], &bc); - if p < 0 { - counts.push("0".to_string()); - counts_sorted.push(0); - } else { - let x = gex_info.fb_top_matrices[0].value(p as usize, n); - counts.push(format!("{}", x)); - counts_sorted.push(x); - } - } - counts_sorted.sort_unstable(); - median = rounded_median(&counts_sorted); - } - cell: counts - exact: format!("{}", median) -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ name: filter inputs: lvar_vdj limits: diff --git a/mirror_sparse_matrix/Cargo.toml b/mirror_sparse_matrix/Cargo.toml deleted file mode 100644 index 66dad9cd6..000000000 --- a/mirror_sparse_matrix/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] -name = "mirror_sparse_matrix" -version = "0.1.17" -authors = ["David Jaffe "] -license = "MIT" -description = "Some tools that are 'internal' for now because they are insufficiently refined and unstable, but which are used by other 'public' crates." -edition = "2018" -repository = "https://github.com/10XGenomics/enclone_ranger" - -[dependencies] -binary_vec_io = { path = "../binary_vec_io" } - -[dev-dependencies] -io_utils = { path = "../io_utils" } -pretty_trace = { path = "../pretty_trace" } diff --git a/mirror_sparse_matrix/LICENSE b/mirror_sparse_matrix/LICENSE deleted file mode 100644 index 2837d2442..000000000 --- a/mirror_sparse_matrix/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2018-2019 10x Genomics, Inc. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/mirror_sparse_matrix/README.md b/mirror_sparse_matrix/README.md deleted file mode 100644 index 4cad8e1d5..000000000 --- a/mirror_sparse_matrix/README.md +++ /dev/null @@ -1,2 +0,0 @@ -This crate contains utilities that are used by the crate -pretty_trace. In time the contents of this crate may be documented, but for now, we recommend against directly using them, as they may be changed. diff --git a/mirror_sparse_matrix/src/lib.rs b/mirror_sparse_matrix/src/lib.rs deleted file mode 100644 index a9412ecb5..000000000 --- a/mirror_sparse_matrix/src/lib.rs +++ /dev/null @@ -1,605 +0,0 @@ -// Copyright (c) 2019 10X Genomics, Inc. All rights reserved. - -// This file describes a data structure that reasonably efficiently encodes a non-editable sparse -// matrix of nonnegative integers, each < 2^32, and most which are very small, together with -// string labels for rows and columns. The number of rows and columns are assumed to be ≤ 2^32. -// The structure is optimized for the case where the number of columns is ≤ 2^16. Space -// requirements roughly double if this is exceeded. -// -// The defining property and raison d'être for this data structure is that it can be read directly -// ("mirrored") as a single block of bytes, and thus reading is fast and especially fast on -// subsequent reads once the data are cached. This makes it appropriate for interactive -// applications. But note that actually extracting values from the data structure can be slow. -// -// The data structure is laid out as a vector of bytes, representing a mix of u8, u16 and u32 -// entries, in general unaligned, as follow in the case where the number of columns is ≤ 2^16: -// -// 1. "MirrorSparseMatrix binary file \n" (32 bytes) -// 2. code version (4 bytes) -// 3. storage version (4 bytes) -// 4. number of rows (u32) = n -// 5. number of columns (u32) = k *** ADDED *** -// 6. for each i in 0..n, byte start of data for row i (n x u32) -// 7. for each i in 0..=n, byte start of string label for row i *** ADDED *** -// 8. for each j in 0..=k, byte start of string label for column j *** ADDED *** -// 9. for each row: -// (a) number of sparse entries whose value is stored in u8 (u16) = m1 -// (b) number of sparse entries whose value is stored in u16 (u16) = m2 -// (c) number of sparse entries whose value is stored in u32 (u16) = m4 [note redundant] -// (a') data for (a), m1 entries of form -// - column identifier (u16) -// - value (u8) -// (b') data for (b), m2 entries of form -// - column identifier (u16) -// - value (u16) -// (c') data for (c), m4 entries of form -// - column identifier (u16) -// - value (u32). -// -// The case where the number of columns is > 2^16 is the same except that all the u16 entries are -// changed to u32. -// -// Initial API: -// * read from disk -// * write to disk -// * build from (Vec, Vec, Vec) representation -// * report the number of rows and the number of columns -// * report the sum of entries for a given row -// * report the sum of entries for a given column -// * return the value of a given matrix entry -// * return a given row -// * return a row or column label. -// -// The initial version was 0. In version 1, string labels for rows and columns were added. -// A version 0 file can no longer be read, except that, you can read the version and exit. -// All functions after item 4 above will assert strangely or return garbage. It would be -// better to first call get_code_version_from_file. - -use std::cmp::max; - -#[derive(Clone)] -pub struct MirrorSparseMatrix { - x: Vec, -} - -fn get_u8_at_pos(v: &[u8], pos: usize) -> u8 { - v[pos] -} - -fn get_u16_at_pos(v: &[u8], pos: usize) -> u16 { - let mut z = [0_u8; 2]; - z.clone_from_slice(&v[pos..(2 + pos)]); - u16::from_le_bytes(z) -} - -fn get_u32_at_pos(v: &[u8], pos: usize) -> u32 { - let mut z = [0_u8; 4]; - z.clone_from_slice(&v[pos..(4 + pos)]); - u32::from_le_bytes(z) -} - -fn _put_u8_at_pos(v: &mut [u8], pos: usize, val: u8) { - v[pos] = val; -} - -fn _put_u16_at_pos(v: &mut [u8], pos: usize, val: u16) { - let z = val.to_le_bytes(); - v[pos..(2 + pos)].clone_from_slice(&z[..2]); -} - -fn put_u32_at_pos(v: &mut [u8], pos: usize, val: u32) { - let z = val.to_le_bytes(); - v[pos..(4 + pos)].clone_from_slice(&z[..4]); -} - -fn push_u8(v: &mut Vec, val: u8) { - v.push(val); -} - -fn push_u16(v: &mut Vec, val: u16) { - let z = val.to_le_bytes(); - for i in 0..2 { - v.push(z[i]); - } -} - -fn push_u32(v: &mut Vec, val: u32) { - let z = val.to_le_bytes(); - for i in 0..4 { - v.push(z[i]); - } -} - -impl MirrorSparseMatrix { - pub fn new() -> MirrorSparseMatrix { - let v = Vec::::new(); - MirrorSparseMatrix { x: v } - } - - pub fn initialized(&self) -> bool { - !self.x.is_empty() - } - - fn header_size() -> usize { - 32 // text header - + 4 // code version - + 4 // storage version - + 4 // number of rows - + 4 // number of columns - } - - fn code_version(&self) -> usize { - get_u32_at_pos(&self.x, 32) as usize - } - - fn storage_version(&self) -> usize { - get_u32_at_pos(&self.x, 36) as usize - } - - pub fn build_from_vec( - x: &[Vec<(i32, i32)>], - row_labels: &[String], - col_labels: &[String], - ) -> MirrorSparseMatrix { - let mut max_col = 0_i32; - for i in 0..x.len() { - for j in 0..x[i].len() { - max_col = max(max_col, x[i][j].0); - } - } - let mut storage_version = 0_u32; - if max_col >= 65536 { - storage_version = 1_u32; - } - let hs = MirrorSparseMatrix::header_size(); - let mut v = Vec::::new(); - let mut total_bytes = hs + 4 * x.len(); - for i in 0..x.len() { - let (mut m1, mut m2, mut m4) = (0, 0, 0); - for j in 0..x[i].len() { - if x[i][j].1 < 256 { - m1 += 1; - } else if x[i][j].1 < 65536 { - m2 += 1; - } else { - m4 += 1; - } - } - if storage_version == 0 { - total_bytes += 6 + 3 * m1 + 4 * m2 + 6 * m4; - } else { - total_bytes += 12 + 5 * m1 + 6 * m2 + 8 * m4; - } - } - let (n, k) = (row_labels.len(), col_labels.len()); - assert_eq!(n, x.len()); - total_bytes += 4 * (1 + n); - total_bytes += 4 * (1 + k); - let byte_start_of_row_labels = total_bytes; - for i in 0..n { - total_bytes += row_labels[i].len(); - } - let byte_start_of_col_labels = total_bytes; - for j in 0..k { - total_bytes += col_labels[j].len(); - } - v.reserve(total_bytes); - v.append(&mut b"MirrorSparseMatrix binary file \n".to_vec()); - assert_eq!(v.len(), 32); - const CURRENT_CODE_VERSION: usize = 1; - let code_version = CURRENT_CODE_VERSION as u32; - push_u32(&mut v, code_version); - push_u32(&mut v, storage_version); - push_u32(&mut v, n as u32); - push_u32(&mut v, k as u32); - assert_eq!(v.len(), hs); - for _ in 0..n { - push_u32(&mut v, 0_u32); - } - - // Define row and column label starts. - - let mut pos = byte_start_of_row_labels; - for i in 0..=n { - push_u32(&mut v, pos as u32); - if i < n { - pos += row_labels[i].len(); - } - } - let mut pos = byte_start_of_col_labels; - for j in 0..=k { - push_u32(&mut v, pos as u32); - if j < k { - pos += col_labels[j].len(); - } - } - - // Insert matrix entries. - - for i in 0..n { - let p = v.len() as u32; - put_u32_at_pos(&mut v, hs + 4 * i, p); - let (mut m1, mut m2, mut m4) = (0, 0, 0); - for j in 0..x[i].len() { - if x[i][j].1 < 256 { - m1 += 1; - } else if x[i][j].1 < 65536 { - m2 += 1; - } else { - m4 += 1; - } - } - if storage_version == 0 { - push_u16(&mut v, m1 as u16); - push_u16(&mut v, m2 as u16); - push_u16(&mut v, m4 as u16); - } else { - push_u32(&mut v, m1 as u32); - push_u32(&mut v, m2 as u32); - push_u32(&mut v, m4 as u32); - } - for j in 0..x[i].len() { - if x[i][j].1 < 256 { - if storage_version == 0 { - push_u16(&mut v, x[i][j].0 as u16); - } else { - push_u32(&mut v, x[i][j].0 as u32); - } - push_u8(&mut v, x[i][j].1 as u8); - } - } - for j in 0..x[i].len() { - if x[i][j].1 >= 256 && x[i][j].1 < 65536 { - if storage_version == 0 { - push_u16(&mut v, x[i][j].0 as u16); - } else { - push_u32(&mut v, x[i][j].0 as u32); - } - push_u16(&mut v, x[i][j].1 as u16); - } - } - for j in 0..x[i].len() { - if x[i][j].1 >= 65536 { - if storage_version == 0 { - push_u16(&mut v, x[i][j].0 as u16); - } else { - push_u32(&mut v, x[i][j].0 as u32); - } - push_u32(&mut v, x[i][j].1 as u32); - } - } - } - - // Insert row and column labels. - - for i in 0..n { - for p in 0..row_labels[i].len() { - v.push(row_labels[i].as_bytes()[p]); - } - } - for j in 0..k { - for p in 0..col_labels[j].len() { - v.push(col_labels[j].as_bytes()[p]); - pos += 1; - } - } - - // Done. - - assert_eq!(total_bytes, v.len()); - MirrorSparseMatrix { x: v } - } - - pub fn nrows(&self) -> usize { - get_u32_at_pos(&self.x, 40) as usize - } - - pub fn ncols(&self) -> usize { - get_u32_at_pos(&self.x, 44) as usize - } - - fn start_of_row(&self, row: usize) -> usize { - let pos = MirrorSparseMatrix::header_size() + row * 4; - get_u32_at_pos(&self.x, pos) as usize - } - - pub fn row_label(&self, i: usize) -> String { - let row_labels_start = MirrorSparseMatrix::header_size() + self.nrows() * 4; - let label_start = get_u32_at_pos(&self.x, row_labels_start + i * 4); - let label_stop = get_u32_at_pos(&self.x, row_labels_start + (i + 1) * 4); - let label_bytes = &self.x[label_start as usize..label_stop as usize]; - String::from_utf8(label_bytes.to_vec()).unwrap() - } - - pub fn col_label(&self, j: usize) -> String { - let col_labels_start = MirrorSparseMatrix::header_size() + self.nrows() * 8 + 4; - let label_start = get_u32_at_pos(&self.x, col_labels_start + j * 4); - let label_stop = get_u32_at_pos(&self.x, col_labels_start + (j + 1) * 4); - let label_bytes = &self.x[label_start as usize..label_stop as usize]; - String::from_utf8(label_bytes.to_vec()).unwrap() - } - - pub fn row(&self, row: usize) -> Vec<(usize, usize)> { - let mut all = Vec::<(usize, usize)>::new(); - let s = self.start_of_row(row); - if self.storage_version() == 0 { - let m1 = get_u16_at_pos(&self.x, s) as usize; - let m2 = get_u16_at_pos(&self.x, s + 2) as usize; - let m4 = get_u16_at_pos(&self.x, s + 4) as usize; - for i in 0..m1 { - let pos = s + 6 + 3 * i; - let col = get_u16_at_pos(&self.x, pos) as usize; - let entry = get_u8_at_pos(&self.x, pos + 2) as usize; - all.push((col, entry)); - } - for i in 0..m2 { - let pos = s + 6 + 3 * m1 + 4 * i; - let col = get_u16_at_pos(&self.x, pos) as usize; - let entry = get_u16_at_pos(&self.x, pos + 2) as usize; - all.push((col, entry)); - } - for i in 0..m4 { - let pos = s + 6 + 3 * m1 + 4 * m2 + 6 * i; - let col = get_u16_at_pos(&self.x, pos) as usize; - let entry = get_u32_at_pos(&self.x, pos + 2) as usize; - all.push((col, entry)); - } - } else { - let m1 = get_u32_at_pos(&self.x, s) as usize; - let m2 = get_u32_at_pos(&self.x, s + 4) as usize; - let m4 = get_u32_at_pos(&self.x, s + 8) as usize; - for i in 0..m1 { - let pos = s + 12 + 5 * i; - let col = get_u32_at_pos(&self.x, pos) as usize; - let entry = get_u8_at_pos(&self.x, pos + 4) as usize; - all.push((col, entry)); - } - for i in 0..m2 { - let pos = s + 12 + 5 * m1 + 6 * i; - let col = get_u32_at_pos(&self.x, pos) as usize; - let entry = get_u16_at_pos(&self.x, pos + 4) as usize; - all.push((col, entry)); - } - for i in 0..m4 { - let pos = s + 12 + 5 * m1 + 6 * m2 + 8 * i; - let col = get_u32_at_pos(&self.x, pos) as usize; - let entry = get_u32_at_pos(&self.x, pos + 4) as usize; - all.push((col, entry)); - } - } - all.sort_unstable(); - all - } - - pub fn sum_of_row(&self, row: usize) -> usize { - let s = self.start_of_row(row); - let mut sum = 0; - if self.storage_version() == 0 { - let m1 = get_u16_at_pos(&self.x, s) as usize; - let m2 = get_u16_at_pos(&self.x, s + 2) as usize; - let m4 = get_u16_at_pos(&self.x, s + 4) as usize; - for i in 0..m1 { - let pos = s + 6 + 3 * i + 2; - sum += get_u8_at_pos(&self.x, pos) as usize; - } - for i in 0..m2 { - let pos = s + 6 + 3 * m1 + 4 * i + 2; - sum += get_u16_at_pos(&self.x, pos) as usize; - } - for i in 0..m4 { - let pos = s + 6 + 3 * m1 + 4 * m2 + 6 * i + 2; - sum += get_u32_at_pos(&self.x, pos) as usize; - } - } else { - let m1 = get_u32_at_pos(&self.x, s) as usize; - let m2 = get_u32_at_pos(&self.x, s + 4) as usize; - let m4 = get_u32_at_pos(&self.x, s + 8) as usize; - for i in 0..m1 { - let pos = s + 12 + 5 * i + 4; - sum += get_u8_at_pos(&self.x, pos) as usize; - } - for i in 0..m2 { - let pos = s + 12 + 5 * m1 + 6 * i + 4; - sum += get_u16_at_pos(&self.x, pos) as usize; - } - for i in 0..m4 { - let pos = s + 12 + 5 * m1 + 6 * m2 + 8 * i + 4; - sum += get_u32_at_pos(&self.x, pos) as usize; - } - } - sum - } - - pub fn sum_of_col(&self, col: usize) -> usize { - let mut sum = 0; - if self.storage_version() == 0 { - for row in 0..self.nrows() { - let s = self.start_of_row(row); - let m1 = get_u16_at_pos(&self.x, s) as usize; - let m2 = get_u16_at_pos(&self.x, s + 2) as usize; - let m4 = get_u16_at_pos(&self.x, s + 4) as usize; - for i in 0..m1 { - let pos = s + 6 + 3 * i; - let f = get_u16_at_pos(&self.x, pos) as usize; - if f == col { - sum += get_u8_at_pos(&self.x, pos + 2) as usize; - } - } - for i in 0..m2 { - let pos = s + 6 + 3 * m1 + 4 * i; - let f = get_u16_at_pos(&self.x, pos) as usize; - if f == col { - sum += get_u16_at_pos(&self.x, pos + 2) as usize; - } - } - for i in 0..m4 { - let pos = s + 6 + 3 * m1 + 4 * m2 + 6 * i; - let f = get_u16_at_pos(&self.x, pos) as usize; - if f == col { - sum += get_u32_at_pos(&self.x, pos + 2) as usize; - } - } - } - } else { - for row in 0..self.nrows() { - let s = self.start_of_row(row); - let m1 = get_u32_at_pos(&self.x, s) as usize; - let m2 = get_u32_at_pos(&self.x, s + 4) as usize; - let m4 = get_u32_at_pos(&self.x, s + 8) as usize; - for i in 0..m1 { - let pos = s + 12 + 5 * i; - let f = get_u32_at_pos(&self.x, pos) as usize; - if f == col { - sum += get_u8_at_pos(&self.x, pos + 4) as usize; - } - } - for i in 0..m2 { - let pos = s + 12 + 5 * m1 + 6 * i; - let f = get_u32_at_pos(&self.x, pos) as usize; - if f == col { - sum += get_u16_at_pos(&self.x, pos + 4) as usize; - } - } - for i in 0..m4 { - let pos = s + 12 + 5 * m1 + 6 * m2 + 8 * i; - let f = get_u32_at_pos(&self.x, pos) as usize; - if f == col { - sum += get_u32_at_pos(&self.x, pos + 4) as usize; - } - } - } - } - sum - } - - pub fn value(&self, row: usize, col: usize) -> usize { - let s = self.start_of_row(row); - if self.storage_version() == 0 { - let m1 = get_u16_at_pos(&self.x, s) as usize; - let m2 = get_u16_at_pos(&self.x, s + 2) as usize; - let m4 = get_u16_at_pos(&self.x, s + 4) as usize; - for i in 0..m1 { - let pos = s + 6 + 3 * i; - let f = get_u16_at_pos(&self.x, pos) as usize; - if f == col { - return get_u8_at_pos(&self.x, pos + 2) as usize; - } - } - for i in 0..m2 { - let pos = s + 6 + 3 * m1 + 4 * i; - let f = get_u16_at_pos(&self.x, pos) as usize; - if f == col { - return get_u16_at_pos(&self.x, pos + 2) as usize; - } - } - for i in 0..m4 { - let pos = s + 6 + 3 * m1 + 4 * m2 + 6 * i; - let f = get_u16_at_pos(&self.x, pos) as usize; - if f == col { - return get_u32_at_pos(&self.x, pos + 2) as usize; - } - } - 0 - } else { - let m1 = get_u32_at_pos(&self.x, s) as usize; - let m2 = get_u32_at_pos(&self.x, s + 4) as usize; - let m4 = get_u32_at_pos(&self.x, s + 8) as usize; - for i in 0..m1 { - let pos = s + 12 + 5 * i; - let f = get_u32_at_pos(&self.x, pos) as usize; - if f == col { - return get_u8_at_pos(&self.x, pos + 4) as usize; - } - } - for i in 0..m2 { - let pos = s + 12 + 5 * m1 + 6 * i; - let f = get_u32_at_pos(&self.x, pos) as usize; - if f == col { - return get_u16_at_pos(&self.x, pos + 4) as usize; - } - } - for i in 0..m4 { - let pos = s + 12 + 5 * m1 + 6 * m2 + 8 * i; - let f = get_u32_at_pos(&self.x, pos) as usize; - if f == col { - return get_u32_at_pos(&self.x, pos + 4) as usize; - } - } - 0 - } - } -} - -impl Default for MirrorSparseMatrix { - fn default() -> Self { - Self::new() - } -} - -#[cfg(test)] -mod tests { - - // test with: cargo test -p mirror_sparse_matrix -- --nocapture - - use super::*; - use io_utils::printme; - use pretty_trace::PrettyTrace; - - #[test] - fn test_mirror_sparse_matrix() { - PrettyTrace::new().on(); - // Dated comment: - // We have observed that with cargo test --release, tracebacks here can be incomplete, - // and indeed this is true even if one doesn't use pretty trace. In such cases, running - // without --release works. - // Really should document this example in pretty_trace. - for storage_version in 0..2 { - printme!(storage_version); - let mut x = Vec::>::new(); - let (n, k) = (10, 100); - for i in 0..n { - let mut y = Vec::<(i32, i32)>::new(); - for j in 0..k { - let col = if storage_version == 0 { - i + j - } else { - 10000 * i + j - }; - y.push((col, (i * i * j))); - } - x.push(y); - } - let test_row = 9; - let mut row_sum = 0; - for j in 0..x[test_row].len() { - row_sum += x[test_row][j].1 as usize; - } - let (mut row_labels, mut col_labels) = (Vec::::new(), Vec::::new()); - for i in 0..n { - row_labels.push(format!("row {i}")); - } - for j in 0..k { - col_labels.push(format!("col {j}")); - } - let y = MirrorSparseMatrix::build_from_vec(&x, &row_labels, &col_labels); - let row_sum2 = y.sum_of_row(test_row); - assert_eq!(row_sum, row_sum2); - let test_col = if storage_version == 0 { 15 } else { 90001 }; - let mut col_sum = 0; - for i in 0..x.len() { - for j in 0..x[i].len() { - assert_eq!(x[i][j].1 as usize, y.value(i, x[i][j].0 as usize)); - if x[i][j].0 as usize == test_col { - col_sum += x[i][j].1 as usize; - } - } - } - let col_sum2 = y.sum_of_col(test_col); - printme!(col_sum, col_sum2); - assert_eq!(col_sum, col_sum2); - assert_eq!(y.storage_version(), storage_version); - assert_eq!(y.row_label(5), row_labels[5]); - assert_eq!(y.col_label(7), col_labels[7]); - } - } -} From d9ac576bfc4a42a57d05cb53d1cbc8b935685d6f Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 28 Feb 2024 17:23:06 -0800 Subject: [PATCH 6/7] Delete the TOP_GENES option. --- enclone/src/UNDOC_OPTIONS | 3 --- enclone_args/src/proc_args.rs | 1 - enclone_core/src/defs.rs | 1 - 3 files changed, 5 deletions(-) diff --git a/enclone/src/UNDOC_OPTIONS b/enclone/src/UNDOC_OPTIONS index 6b480bce2..2590887ed 100644 --- a/enclone/src/UNDOC_OPTIONS +++ b/enclone/src/UNDOC_OPTIONS @@ -278,9 +278,6 @@ This won't work with paging, however you can pipe to "less -r". ROW_FILL_VERBOSE: special option for debugging -TOP_GENES: list genes having the highest expression in the selected clonotypes, taking the median -across all cells (only implemented if .bin file has been generated using NH5) - COMPE: COMP, plus enforce no unaccounted time (except up to 0.02 seconds) UNACCOUNTED: show unaccounted time at each step EVIL_EYE: print logging to facilitate diagnosis of mysterious hanging diff --git a/enclone_args/src/proc_args.rs b/enclone_args/src/proc_args.rs index 79ad45a81..2047707d6 100644 --- a/enclone_args/src/proc_args.rs +++ b/enclone_args/src/proc_args.rs @@ -494,7 +494,6 @@ pub fn proc_args(ctl: &mut EncloneControl, args: &[String]) -> Result<(), String "SUPPRESS_ISOTYPE_LEGEND", &mut ctl.plot_opt.plot_by_isotype_nolegend, ), - ("TOP_GENES", &mut ctl.gen_opt.top_genes), ("TOY", &mut ctl.gen_opt.toy), ("TOY_COM", &mut ctl.gen_opt.toy_com), ("UMI_FILT_MARK", &mut ctl.clono_filt_opt_def.umi_filt_mark), diff --git a/enclone_core/src/defs.rs b/enclone_core/src/defs.rs index 484a364f0..c22b94a84 100644 --- a/enclone_core/src/defs.rs +++ b/enclone_core/src/defs.rs @@ -217,7 +217,6 @@ pub struct GeneralOpt { pub row_fill_verbose: bool, pub config_file: String, pub config: HashMap, - pub top_genes: bool, pub toy_com: bool, pub chains_to_align: Vec, pub chains_to_align2: Vec, From 85bfba42f099a63b60351a33972a56aabcd784ac Mon Sep 17 00:00:00 2001 From: Chris Macklin Date: Wed, 28 Feb 2024 17:29:08 -0800 Subject: [PATCH 7/7] Rip out the ALL_BC feature. --- enclone/src/UNDOC_OPTIONS | 24 ------------------------ enclone_args/src/proc_args_post.rs | 11 ----------- enclone_args/src/process_special_arg1.rs | 22 ---------------------- enclone_core/src/defs.rs | 4 ---- 4 files changed, 61 deletions(-) diff --git a/enclone/src/UNDOC_OPTIONS b/enclone/src/UNDOC_OPTIONS index 2590887ed..004dbebbd 100644 --- a/enclone/src/UNDOC_OPTIONS +++ b/enclone/src/UNDOC_OPTIONS @@ -313,30 +313,6 @@ SUBSAMPLE: subsample barcodes at the indicated fraction; at present this is deli =================================================================================================== -ALL_BC=filename,field1,...,fieldn - -Dump a CSV file with fields - - dataset,barcode,field1,...,fieldn - -to filename, with one line for each barcode. All barcodes in the data files are included, whether -or not they correspond to VDJ cells. This would correspond (at least approximately) to all -whitelisted barcodes in the raw data. - -Only certain other fields are allowed: - -1. feature variables, e.g. CDR3_ab, representing the UMI count -2. gex = total gene expression UMI count -3. type = the cell type -4. clust = cluster id, from analysis_csv/clustering/graphclust/clusters.csv -5. cell = vdj or gex or gex_vdj or empty. - -This may also be used with VAR_DEF. - -ALL_BCH: same but human readable instead of CSV - -=================================================================================================== - PRE_EVAL: evaluate sensitivity and specificity before calling print_clonotypes JOIN_BASIC_H=n: use a special join heuristic as follows (for demonstration purposes only) diff --git a/enclone_args/src/proc_args_post.rs b/enclone_args/src/proc_args_post.rs index 2eecc22a9..b0a0ea350 100644 --- a/enclone_args/src/proc_args_post.rs +++ b/enclone_args/src/proc_args_post.rs @@ -325,17 +325,6 @@ pub fn proc_args_post( } } - // Substitute VAR_DEF into ALL_BC. - - for i in 0..ctl.gen_opt.all_bc_fields.len() { - for j in 0..ctl.gen_opt.var_def.len() { - if ctl.gen_opt.all_bc_fields[i] == ctl.gen_opt.var_def[j].0 { - ctl.gen_opt.all_bc_fields[i] = ctl.gen_opt.var_def[j].3.clone(); - break; - } - } - } - // Sanity check grouping arguments. if ctl.clono_group_opt.style == "asymmetric" diff --git a/enclone_args/src/process_special_arg1.rs b/enclone_args/src/process_special_arg1.rs index c7cf2ce8f..161e15327 100644 --- a/enclone_args/src/process_special_arg1.rs +++ b/enclone_args/src/process_special_arg1.rs @@ -65,28 +65,6 @@ pub fn process_special_arg1( return Err(format!("\nArgument {arg} is not properly specified.\n")); } ctl.gen_opt.chains_to_jun_align2.push(n.force_usize()); - } else if arg.starts_with("ALL_BC=") || arg.starts_with("ALL_BCH=") { - let parts; - if arg.starts_with("ALL_BC=") { - parts = arg.after("ALL_BC=").split(',').collect::>(); - } else { - parts = arg.after("ALL_BCH=").split(',').collect::>(); - ctl.gen_opt.all_bc_human = true; - } - if parts.is_empty() || parts[0].is_empty() { - return Err( - "\nFor ALL_BC/ALL_BCH, at a minimum, a filename must be provided.\n".to_string(), - ); - } - if !ctl.gen_opt.all_bc_filename.is_empty() { - return Err("\nThe argument ALL_BC/ALL_BCH may only be used once.\n".to_string()); - } - ctl.gen_opt.all_bc_filename = parts[0].to_string(); - test_writeable(&ctl.gen_opt.all_bc_filename, ctl.gen_opt.evil_eye)?; - ctl.gen_opt - .all_bc_fields - .extend(parts.into_iter().skip(1).map(str::to_string)); - ctl.gen_opt.all_bc_fields_orig = ctl.gen_opt.all_bc_fields.clone(); } else if arg.starts_with("STATE_NARRATIVE=") { let mut narrative = arg.after("STATE_NARRATIVE=").to_string(); if narrative.starts_with('@') { diff --git a/enclone_core/src/defs.rs b/enclone_core/src/defs.rs index c22b94a84..6cc3452f5 100644 --- a/enclone_core/src/defs.rs +++ b/enclone_core/src/defs.rs @@ -242,10 +242,6 @@ pub struct GeneralOpt { pub var_def: Vec<(String, String, Node, String)>, // {(variable, value, compiled value, expr)} pub nospaces: bool, pub subsample: f64, - pub all_bc_filename: String, - pub all_bc_human: bool, - pub all_bc_fields: Vec, - pub all_bc_fields_orig: Vec, pub gamma_delta: bool, pub pre_eval: bool, pub pre_eval_show: bool,