10XGenomics · macklin-10x · Feb 29, 2024 · Feb 28, 2024 · Feb 29, 2024 · Feb 29, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -24,7 +24,6 @@ members = [
     "io_utils",
     "kmer_lookup",
     "load_feature_bc",
-    "mirror_sparse_matrix",
     "perf_stats",
     "pretty_trace",
     "stats_utils",

diff --git a/enclone/src/UNDOC_OPTIONS b/enclone/src/UNDOC_OPTIONS
@@ -1,11 +1,12 @@
 
-This file contains enclone options that are not in the online documentation for enclone.  They are 
-here for a reason, for example, maybe they are buggy, or untested, or of dubious utility, or 
+This file contains enclone options that are not in the online documentation for enclone.  They are
+here for a reason, for example, maybe they are buggy, or untested, or of dubious utility, or
 intended for debugging purposes only, or only work from a 10x server, etc.
 
 Use at your own risk, ha ha.
 
 Optional arguments controlling printing of join events:
+
 - SEQ: print sequences of contigs, before truncation to V..J
 - ANN: print annotations of contig
 - ANN0: print annotations of contig, after truncation to V..J
@@ -15,38 +16,45 @@ Optional arguments controlling printing of join events:
 - FAILS_ONLY: only print failed clonotypes.
 
 Optional arguments that control clonotype filtering:
+
 - WHITEF: only show clonotypes that exhibit whitelist contamination
 - PROTECT_BADS: don't delete putatively bad stuff
 - VDUP: only show clonotypes having two chains with the same V segment
 - HAVE_ONESIE: only show clonotypes including a onesie exact subclonotype
 
 Optional arguments that control miscellaneous logging:
+
 - DESCRIP: print sample descriptions, and also reference info
 - REUSE: print barcode reuse (that might arise from index hopping)
 - EXACT=n: print some data about exact subclonotype n.
 
 Optional arguments controlling logging for alternate alleles:
+
 - CON: print alternate consensus sequences
 - CON_TRACE: tracing for CON.
 
 Optional arguments that control the joining algorithm:
+
 - MIN_MULT: to document
 - MIN_ALT: to document
 - BCJOIN: substitute an algorithm that joins by barcode identity and ignores everything
           else -- only useful for studying two datasets from the same GEM well that used
           different enrichment protocols.
 
 Optional arguments governing input and output files:
+
 - EXFASTA=f: dump fasta file f for V..J exact subclonotypes.
 
 Optional arguments that control printing of individual clonotypes:
+
 - white   = percent of sequences implicated in whitelist expansion.
 - CHAIN_BRIEF: show abbreviated chain column headers
 - DEBUG_TABLE_PRINTING: add print lines to help debug printing of tables.
-- NOTE_SIMPLE: note if the first sequence for the chain is simple, in the sense that it exactly 
+- NOTE_SIMPLE: note if the first sequence for the chain is simple, in the sense that it exactly
                equals the concatenation of the right-truncated V with the full J segment.
 
 Other optional arguments:
+
 - FORCE: make joins even if redundant
 - EXP: exploratory code for exact clonotyping on
 - WEAK: for EXP, print all and show weaks
@@ -67,47 +75,48 @@ Other optional arguments:
 - DUMP_INTERNAL_IDS: special option to dump the list of internal ids and exit.
 - TOY: toy with phylogeny
 
-If an argument is a nonnegative integer, it is treated as if specified by TCR= or BCR=, but 
-without setting one of them.  Nonnegative integers are recognized as 10x dataset identifiers 
-("sample IDs") and their location is automatically found.  Inclusive ranges l1-l2 are 
+If an argument is a nonnegative integer, it is treated as if specified by TCR= or BCR=, but
+without setting one of them.  Nonnegative integers are recognized as 10x dataset identifiers
+("sample IDs") and their location is automatically found.  Inclusive ranges l1-l2 are
 expanded out.
 
 CELLRANGER: for use if called from cellranger -- changes failure message and prevents exit
             upon normal completion
 
 EXT=filename:
-Given output of an external clonotyping algorithm which took as inputs the pipeline outputs 
-for the lenas in enclone.testdata, for each exact subclonotype found by enclone, report its 
+Given output of an external clonotyping algorithm which took as inputs the pipeline outputs
+for the lenas in enclone.testdata, for each exact subclonotype found by enclone, report its
 composition in the external clonotyping, as clonotype_id[count], ...
 The input file should have lines of the form:
 sample barcode clonotype_id.
 
-SUMMARY_CLEAN: if SUMMARY specified, don't show computational performance stats, so 
+SUMMARY_CLEAN: if SUMMARY specified, don't show computational performance stats, so
 we can regress on output
 
 ACCEPT_INCONSISTENT: temporary option to accept inconsistent reference/annotations and feature refs
 
 CURRENT_REF: temp option to force CR 4.0 reference
 
 H5_SLICE: read GEX data slice by slice rather than preloading
-H5: force use of H5, even if feature_barcode_matrix.bin has been built
 
 FORCE_EXTERNAL: do not test for internal run
 
 STABLE_DOC: print documentation in a way that it won't change when the git version changes
 
 LVARS entries
-* entropy = for each cell, take the sum over all genes of -q/log2(q), where q is the fraction of 
+
+- entropy = for each cell, take the sum over all genes of -q/log2(q), where q is the fraction of
   total gene UMIs in the given gene; for an exact subclonotype report the median of these values
-* clust = cluster id, from analysis_csv/clustering/graphclust/clusters.csv
+- clust = cluster id, from analysis_csv/clustering/graphclust/clusters.csv
   (for now, has to be in that exact place; not implemented for PCELL)
-* type = cell type, from analysis_csv/celltypes/celltypes.csv
+- type = cell type, from analysis_csv/celltypes/celltypes.csv
   Note that it's not clear how doublets are handled.
-* pe<n> = PCA equivalence class at distance ≤ n, where n is an integer: it shows the equivalence
+- pe<n> = PCA equivalence class at distance ≤ n, where n is an integer: it shows the equivalence
   class on all cells in a single clonotype when two cells are called equivalent if their distance
   in PCA space is ≤ n.  This is only implemented for PER_CELL.
   This uses the file analysis_csv/pca/10_components/projection.csv.
   WARNING: this only makes sense if you have one dataset--this condition is not checked.
+
 - npe<n> = total number of cells in this clonotype that are within PCA distance n of this cell
 - ppe<n> = percent of all gex cells within PCA distance n of this cell that are in this clonotype
 
@@ -118,6 +127,7 @@ IMGT: specify the IMGT reference, forces RE so slow
 IMGT_FIX: make certain hardcoded fixes to the IMGT reference.
 
 SVG: render output as svg, clonotype by clonotype.  There are problems with this:
+
 1. Safari does not correctly render these svg files.
 2. The svg files use the Menlo font, which works on a Mac but is not universally available.
 3. We don't render background, so this is broken for PER_CELL.
@@ -127,6 +137,7 @@ TRACE_BARCODE: print some additional information for this particular barcode
 SUMMARY_CSV: nascent option to emit some summary information as CSV output
 
 A complex of related experimental stuff:
+
 - BASELINE: show stats relevant to NUMI and the the following two options
 - UMI_FILT_MARK: mark cells based on these stats; view with undocumented lvar "mark"
   and only viewable with PER_CELL
@@ -142,8 +153,9 @@ ECHOC: echo command, preceeded by "# ", and don't emit blank line
 ---------------------------------------------------------------------------------------------------
 
 MARK_STATS: generate three statistics (assumes GEX data included):
-1. number of dubious cells, where dubious means not classified as a B cell, or in a clonotype for 
-   which the ratio of the top dataset cells to the runner up dataset cells is at least 10 
+
+1. number of dubious cells, where dubious means not classified as a B cell, or in a clonotype for
+   which the ratio of the top dataset cells to the runner up dataset cells is at least 10
    (e.g. 3:0 works as does 10:1)
 2. number of cells that are marked (in selected clonotypes)
 3. of these, number that are "good", defined as
@@ -152,16 +164,17 @@ MARK_STATS: generate three statistics (assumes GEX data included):
    - and which have 2 or 3 chains.
 
 MARK_STATS2: generate four statistics (assumes GEX data included):
+
 1. number of fake expanded clonotype cells, defined as all but one cell in clonotypes for
    which the ratio of the top dataset cells to the runner up dataset cells is at least 10
 2. number of these cells that are marked by the filter (ADVANTAGE)
-3. number of good expanded clonotype cells, defined as cells in clonotypes having at least 10 
+3. number of good expanded clonotype cells, defined as cells in clonotypes having at least 10
    cells, called a B cell by GEX, having 2 or 3 chains, and not fake as above
 2. number of these cells that are marked by the filter (DISADVANTAGE)
 [The motivation for this, as distinct from MARK_STATS, is that (1) now consists of just cells in
 fake expanded clonotypes, and not just all dubious cells, would could be a substantially larger
 set.  This allows the analysis of the effect of a filter on fake expanded clonotypes.]
-*** THIS SHOULD BE RUN WITH NGEX AND NCROSS. ***
+***THIS SHOULD BE RUN WITH NGEX AND NCROSS.***
 
 ---------------------------------------------------------------------------------------------------
 
@@ -185,7 +198,7 @@ NALL_GEX: turn off all filters except GEX filter
 
 REPROD: accept nonproductive contigs, recompute productive, then reject nonproductive
 
-SPLIT_COMMAND: assuming that BCR and GEX have been provided with multiple entries each, split out 
+SPLIT_COMMAND: assuming that BCR and GEX have been provided with multiple entries each, split out
 separate commands having them specified one by one, and run them, then exit
 
 OPTIONS THAT ALLOW METRIC VALUES TO BE LOCKED
@@ -228,26 +241,28 @@ All three classes of UMIs are capped at 20 UMIs.
 ===================================================================================================
 
 CLONOTYPE_GROUP_NAMES=filename
-* The file should be a a CSV file with that includes fields new_group_name and group_id.  This is
+
+- The file should be a a CSV file with that includes fields new_group_name and group_id.  This is
   confusing nomenclature, see what follows.
-* The group_id should be obtained by running enclone WITHOUT grouping, so that there is exactly 
+- The group_id should be obtained by running enclone WITHOUT grouping, so that there is exactly
   one clonotype per group.
-* The new_group_name is the name of a group (of your own concoction) that you are assigning a 
+- The new_group_name is the name of a group (of your own concoction) that you are assigning a
   clonotype to.
-* A clonotype may not be assigned to more than one group.
-* The idea is that one would run enclone once to gather information that would
+- A clonotype may not be assigned to more than one group.
+- The idea is that one would run enclone once to gather information that would
   allow grouping of clonotypes.  Then run it as second time with this argument.
-* The argument only does something if honeycomb plotting is used.  In that case, the honeycomb
+- The argument only does something if honeycomb plotting is used.  In that case, the honeycomb
   plot is split by group and each group is given a different (light) background color.
-* Clonotypes that are not assigned to a group are not displayed.
+- Clonotypes that are not assigned to a group are not displayed.
 
 ===================================================================================================
 
 read utilization - This is a stat generated with the SUMMARY option, in internal mode.  It is the
 fraction of reads that are assigned to chains by enclone.  It has the following issues:
+
 1. It doesn't account for reads that are lost because of capping in assembly.  Internally, we
 can work around this by removing the capping on a cellranger branch.  (Done now.)
-2. It would not be correct if read one was significantly longer than 28 bases, so that it could 
+2. It would not be correct if read one was significantly longer than 28 bases, so that it could
 contribute to assembly.
 
 NO_UNCAP_SIM: turn off uncapping simulation
@@ -263,9 +278,6 @@ This won't work with paging, however you can pipe to "less -r".
 
 ROW_FILL_VERBOSE: special option for debugging
 
-TOP_GENES: list genes having the highest expression in the selected clonotypes, taking the median 
-across all cells (only implemented if .bin file has been generated using NH5)
-
 COMPE: COMP, plus enforce no unaccounted time (except up to 0.02 seconds)
 UNACCOUNTED: show unaccounted time at each step
 EVIL_EYE: print logging to facilitate diagnosis of mysterious hanging
@@ -274,10 +286,11 @@ TOY_COM: compute clonotypes, then act as toy server, which can speak to enclone_
 
 TOOLTIP: add tooltip text to honeycomb plots
 
-ALIGN_JUN_ALIGN_CONSISTENCY: test to see if they give consistent results 
+ALIGN_JUN_ALIGN_CONSISTENCY: test to see if they give consistent results
 (run with ALIGN1 JUN_ALIGN1 PLAIN)
 
 junction region alignment penalties accessible from the command line:
+
 - JSCORE_MATCH
 - JSCORE_MISMATCH
 - JSCORE_GAP_EXTEND
@@ -286,6 +299,7 @@ junction region alignment penalties accessible from the command line:
 
 lvars fb<n> and fb<n>_n: for example, fb1 is the 15-base sequence of the most frequent feature
 barcode, and fb1_n is the number of UMIs for that feature barcode
+
 - only usable if there is just one dataset
 - the file feature_barcode_matrix_top.bin must have been generated
 
@@ -299,34 +313,11 @@ SUBSAMPLE: subsample barcodes at the indicated fraction; at present this is deli
 
 ===================================================================================================
 
-ALL_BC=filename,field1,...,fieldn
-
-Dump a CSV file with fields
-
-    dataset,barcode,field1,...,fieldn
-
-to filename, with one line for each barcode.  All barcodes in the data files are included, whether
-or not they correspond to VDJ cells.  This would correspond (at least approximately) to all
-whitelisted barcodes in the raw data.
-
-Only certain other fields are allowed:
-1. feature variables, e.g. CDR3_ab, representing the UMI count
-2. gex = total gene expression UMI count
-3. type = the cell type
-4. clust = cluster id, from analysis_csv/clustering/graphclust/clusters.csv
-5. cell = vdj or gex or gex_vdj or empty.
-
-This may also be used with VAR_DEF.
-
-ALL_BCH: same but human readable instead of CSV
-
-===================================================================================================
-
 PRE_EVAL: evaluate sensitivity and specificity before calling print_clonotypes
 
 JOIN_BASIC_H=n: use a special join heuristic as follows (for demonstration purposes only)
-    same heavy chain V, same heavy chain J, 
-    same heavy chain CDR3 length, 
+    same heavy chain V, same heavy chain J,
+    same heavy chain CDR3 length,
     n% identity on nucleotides within heavy chain CDR3
 This automatically invokes PRE_EVAL because otherwise it would crash.
 This option is particularly slow because it forces more comparisons in the join step.
@@ -338,8 +329,9 @@ EXTERNAL_REF: if you set this to a IMGT reference fasta file, this will compare
 generated donor reference to it, do some analyses, and exit
 
 To generate the current IMGT reference within the 10x codebase, do something like this:
+
 1. bazel build //:pd //:shimulate //:devpipes_env
-2. cellranger/bazel-bin/devpipes_env.sh python cellranger/lib/bin/fetch_imgt_lib.py 
+2. cellranger/bazel-bin/devpipes_env.sh python cellranger/lib/bin/fetch_imgt_lib.py
    --genome vdj_IMGT_human --species "Homo sapiens"
 
 ===================================================================================================

diff --git a/enclone_args/Cargo.toml b/enclone_args/Cargo.toml
@@ -30,7 +30,6 @@ evalexpr = ">=7, <12"
 expr_tools = { path = "../expr_tools" }
 io_utils = { path = "../io_utils" }
 itertools.workspace = true
-mirror_sparse_matrix = { path = "../mirror_sparse_matrix" }
 rand = "0.8"
 rayon = "1"
 regex = { version = "1", default-features = false, features = ["std", "perf"] }

diff --git a/enclone_args/src/load_gex.rs b/enclone_args/src/load_gex.rs
@@ -7,7 +7,6 @@ use crate::load_gex_core::load_gex;
 use enclone_core::defs::{EncloneControl, GexInfo};
 
 use hdf5::Dataset;
-use mirror_sparse_matrix::MirrorSparseMatrix;
 use rayon::prelude::*;
 use std::fmt::Write;
 use std::{collections::HashMap, time::Instant};
@@ -20,11 +19,6 @@ use vector_utils::{bin_position, unique_sort};
 pub fn get_gex_info(ctl: &mut EncloneControl) -> Result<GexInfo, String> {
     let mut gex_features = Vec::<Vec<String>>::new();
     let mut gex_barcodes = Vec::<Vec<String>>::new();
-    let mut gex_matrices = Vec::<MirrorSparseMatrix>::new();
-    let mut fb_top_barcodes = Vec::<Vec<String>>::new();
-    let mut fb_top_matrices = Vec::<MirrorSparseMatrix>::new();
-    let mut fb_top_reads_barcodes = Vec::<Vec<String>>::new();
-    let mut fb_top_reads_matrices = Vec::<MirrorSparseMatrix>::new();
     let mut fb_total_umis = Vec::<u64>::new();
     let mut fb_total_reads = Vec::<u64>::new();
     let mut fb_brn = Vec::<Vec<(String, u32, u32)>>::new();
@@ -48,11 +42,6 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result<GexInfo, String> {
         ctl,
         &mut gex_features,
         &mut gex_barcodes,
-        &mut gex_matrices,
-        &mut fb_top_barcodes,
-        &mut fb_top_matrices,
-        &mut fb_top_reads_barcodes,
-        &mut fb_top_reads_matrices,
         &mut fb_total_umis,
         &mut fb_total_reads,
         &mut fb_brn,
@@ -163,11 +152,6 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result<GexInfo, String> {
     Ok(GexInfo {
         gex_features,
         gex_barcodes,
-        gex_matrices,
-        fb_top_barcodes,
-        fb_top_matrices,
-        fb_top_reads_barcodes,
-        fb_top_reads_matrices,
         fb_total_umis,
         fb_total_reads,
         fb_brn,