From e8d598d86cf22964ba6220d31b8e22c6544869e4 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Fri, 1 Mar 2024 12:49:57 -0800
Subject: [PATCH 01/15] Replace anonymous tuples in JSON parsing with structs.

---
 enclone_args/src/read_json.rs | 164 ++++++++++++++--------------------
 1 file changed, 65 insertions(+), 99 deletions(-)
diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index b5a376765..6e47c3451 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -676,7 +676,17 @@ fn parse_vector_entry_from_json(
 // this code.  Another would be to write out a binary version of the JSON file that contains
 // only the information that we need.
 
-pub fn read_json(
+#[derive(Default)]
+struct ReadJsonResult {
+    vdj_cells: Vec<String>,
+    gex_cells: Vec<String>,
+    gex_cells_specified: bool,
+    cr_version: String,
+    tigs: Vec<TigData>,
+    err: String,
+}
+
+fn read_json(
     accept_inconsistent: bool,
     origin_info: &OriginInfo,
     li: usize,
@@ -731,29 +741,13 @@ pub fn read_json(
             }
         }
     }
-    let mut results = Vec::<(
-        usize,
-        Vec<String>,
-        Vec<String>,
-        bool,
-        String,
-        Vec<TigData>,
-        String,
-    )>::new();
+    let mut results = Vec::<(usize, ReadJsonResult)>::new();
     for i in 0..xs.len() {
-        results.push((
-            i,
-            Vec::<String>::new(),
-            Vec::<String>::new(),
-            false,
-            String::new(),
-            Vec::<TigData>::new(),
-            String::new(),
-        ));
+        results.push((i, Default::default()));
     }
     let exiting = AtomicBool::new(false);
-    results.par_iter_mut().for_each(|res| {
-        let i = res.0;
+    results.par_iter_mut().for_each(|(i, res)| {
+        let i = *i;
         let resx = parse_vector_entry_from_json(
             &xs[i],
             json,
@@ -764,32 +758,32 @@ pub fn read_json(
             to_ref_index,
             reannotate,
             ctl,
-            &mut res.1,
-            &mut res.2,
-            &mut res.3,
-            &mut res.4,
-            &mut res.5,
+            &mut res.vdj_cells,
+            &mut res.gex_cells,
+            &mut res.gex_cells_specified,
+            &mut res.cr_version,
+            &mut res.tigs,
             &exiting,
         );
         if let Err(resx) = resx {
-            res.6 = resx;
+            res.err = resx;
         }
     });
-    for result in &results {
-        if !result.6.is_empty() {
-            return Err(result.6.clone());
+    for (_, result) in &results {
+        if !result.err.is_empty() {
+            return Err(result.err.clone());
         }
     }
-    for result in results.iter_mut().take(xs.len()) {
-        vdj_cells.append(&mut result.1);
-        gex_cells.append(&mut result.2);
-        if result.3 {
+    for (_, result) in results.iter_mut().take(xs.len()) {
+        vdj_cells.append(&mut result.vdj_cells);
+        gex_cells.append(&mut result.gex_cells);
+        if result.gex_cells_specified {
             *gex_cells_specified = true;
         }
-        if !result.4.is_empty() {
-            *cr_version = result.4.clone();
+        if !result.cr_version.is_empty() {
+            *cr_version = result.cr_version.clone();
         }
-        tigs.append(&mut result.5);
+        tigs.append(&mut result.tigs);
     }
     unique_sort(gex_cells);
     let mut tig_bc = Vec::<Vec<TigData>>::new();
@@ -849,6 +843,15 @@ pub fn read_json(
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
 
 // Parse the JSON annotations file(s).
+#[derive(Default)]
+struct LoadResult {
+    tig_bc: Vec<Vec<TigData>>,
+    cr_version: String,
+    vdj_cells: Vec<String>,
+    gex_cells: Vec<String>,
+    gex_cells_specified: bool,
+    err: String,
+}
 
 pub fn parse_json_annotations_files(
     ctl: &EncloneControl,
@@ -860,30 +863,9 @@ pub fn parse_json_annotations_files(
     gex_cells_specified: &mut Vec<bool>,
     fate: &mut [HashMap<String, BarcodeFate>],
 ) -> Result<(), String> {
-    // (origin index, contig name, V..J length): (?)
-    let mut results = Vec::<(
-        usize,
-        Vec<(String, usize)>,
-        Vec<Vec<TigData>>,
-        Vec<Vec<u8>>, // logs
-        String,
-        Vec<String>,
-        Vec<String>,
-        bool,
-        String,
-    )>::new();
+    let mut results = Vec::<(usize, LoadResult)>::new();
     for i in 0..ctl.origin_info.dataset_path.len() {
-        results.push((
-            i,
-            Vec::<(String, usize)>::new(),
-            Vec::<Vec<TigData>>::new(),
-            Vec::<Vec<u8>>::new(),
-            String::new(),
-            Vec::<String>::new(),
-            Vec::<String>::new(),
-            false,
-            String::new(),
-        ));
+        results.push((i, Default::default()));
     }
     // Note: only tracking truncated seq and quals initially
     let ann = if !ctl.gen_opt.cellranger {
@@ -891,12 +873,12 @@ pub fn parse_json_annotations_files(
     } else {
         "contig_annotations.json"
     };
-    results.par_iter_mut().for_each(|res| {
-        let li = res.0;
+    results.par_iter_mut().for_each(|(li, res)| {
+        let li = *li;
         let json = format!("{}/{ann}", ctl.origin_info.dataset_path[li]);
         let json_lz4 = format!("{}/{ann}.lz4", ctl.origin_info.dataset_path[li]);
         if !path_exists(&json) && !path_exists(&json_lz4) {
-            res.8 = format!("\ncan't find {json} or {json_lz4}\n");
+            res.err = format!("\ncan't find {json} or {json_lz4}\n");
             return;
         }
         let resx = read_json(
@@ -907,41 +889,37 @@ pub fn parse_json_annotations_files(
             refdata,
             to_ref_index,
             ctl.gen_opt.reannotate,
-            &mut res.4,
+            &mut res.cr_version,
             ctl,
-            &mut res.5,
-            &mut res.6,
-            &mut res.7,
+            &mut res.vdj_cells,
+            &mut res.gex_cells,
+            &mut res.gex_cells_specified,
         );
         if let Ok(resx) = resx {
             let tig_bc: Vec<Vec<TigData>> = resx;
-            res.5.sort();
-            res.2 = tig_bc;
+            res.vdj_cells.sort();
+            res.tig_bc = tig_bc;
         } else {
-            res.8 = resx.err().unwrap();
+            res.err = resx.err().unwrap();
         }
     });
-    for result in &results {
-        if !result.8.is_empty() {
-            return Err(result.8.clone());
+    for (_, result) in &results {
+        if !result.err.is_empty() {
+            return Err(result.err.clone());
         }
     }
     let mut versions = Vec::<String>::new();
-    for i in 0..results.len() {
-        tig_bc.append(&mut results[i].2.clone());
-        // ctl.gen_opt.cr_version = results[i].4.clone();
-        if results[i].4.is_empty() {
+    for (i, mut result) in results {
+        tig_bc.append(&mut result.tig_bc);
+        if result.cr_version.is_empty() {
             versions.push("≤3.1".to_string());
         } else {
-            versions.push(results[i].4.clone());
+            versions.push(result.cr_version);
         }
-        vdj_cells.push(results[i].5.clone());
-        gex_cells.push(results[i].6.clone());
-        gex_cells_specified.push(results[i].7);
 
-        let cells = &results[i].5;
+        let cells = &result.vdj_cells;
         let mut found = vec![false; cells.len()];
-        let tigs = &results[i].2;
+        let tigs = result.tig_bc;
         for tig in tigs {
             let p = bin_position(cells, &tig[0].barcode);
             if p >= 0 {
@@ -953,22 +931,10 @@ pub fn parse_json_annotations_files(
                 fate[i].insert(cells[j].clone(), BarcodeFate::NonProductive);
             }
         }
+
+        vdj_cells.push(result.vdj_cells);
+        gex_cells.push(result.gex_cells);
+        gex_cells_specified.push(result.gex_cells_specified);
     }
-    /*
-    if !ctl.gen_opt.internal_run {
-        unique_sort(&mut versions);
-        if versions.len() > 1
-            && versions != vec!["4.0".to_string(), "4009.52.0-82-g2244c685a".to_string()]
-        {
-            let args: Vec<String> = env::args().collect();
-            return Err(format!(
-                "\nYou're using output from multiple Cell Ranger versions = {},\n\
-                 which is not allowed.  Your command was:\n{}\n",
-                versions.iter().format(", "),
-                args.iter().format(","),
-            ));
-        }
-    }
-    */
     Ok(())
 }

From 57def9be8232834e6879c094bdace9465ffce129 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Fri, 1 Mar 2024 14:23:55 -0800
Subject: [PATCH 02/15] Invert data control flow in json reading.

---
 enclone_args/src/read_json.rs | 273 ++++++++++++++++------------------
 1 file changed, 127 insertions(+), 146 deletions(-)

diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index 6e47c3451..eaf595c89 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -29,6 +29,7 @@ use debruijn::dna_string::DnaString;
 use enclone_core::barcode_fate::BarcodeFate;
 use enclone_core::defs::{EncloneControl, OriginInfo, TigData};
 use io_utils::{open_maybe_compressed, path_exists, read_vector_entry_from_json};
+use itertools::Itertools;
 use rand::Rng;
 use rayon::prelude::*;
 use serde_json::Value;
@@ -93,6 +94,15 @@ fn json_error(
 
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
 
+#[derive(Default)]
+struct JsonParseResult {
+    vdj_cell: Option<String>,
+    gex_cell: Option<String>,
+    gex_cells_specified: bool,
+    cr_version: String,
+    tig: Option<TigData>,
+}
+
 fn parse_vector_entry_from_json(
     x: &[u8],
     json: &str,
@@ -103,13 +113,9 @@ fn parse_vector_entry_from_json(
     to_ref_index: &HashMap<usize, usize>,
     reannotate: bool,
     ctl: &EncloneControl,
-    vdj_cells: &mut Vec<String>,
-    gex_cells: &mut Vec<String>,
-    gex_cells_specified: &mut bool,
-    cr_version: &mut String,
-    tigs: &mut Vec<TigData>,
     exiting: &AtomicBool,
-) -> Result<(), String> {
+) -> Result<JsonParseResult, String> {
+    let mut res: JsonParseResult = Default::default();
     let v: Value = match serde_json::from_slice(x) {
         Err(_) => {
             return Err(format!(
@@ -132,29 +138,28 @@ fn parse_vector_entry_from_json(
         is_cell = true;
     }
 
-    let is_gex_cell = v["is_gex_cell"].as_bool();
-    if is_gex_cell.is_some() {
-        *gex_cells_specified = true;
-    }
-    if is_gex_cell == Some(true) {
-        gex_cells.push(barcode.clone());
+    if let Some(is_gex_cell) = v["is_gex_cell"].as_bool() {
+        res.gex_cells_specified = true;
+        if is_gex_cell {
+            res.gex_cell = Some(barcode.clone());
+        }
     }
 
     if !ctl.gen_opt.ncell && !is_cell {
-        return Ok(());
+        return Ok(res);
     }
     if is_cell {
-        vdj_cells.push(barcode.clone());
+        res.vdj_cell = Some(barcode.clone());
     }
 
     // Proceed.
 
     if !ctl.gen_opt.reprod && !v["productive"].as_bool().unwrap_or(false) {
-        return Ok(());
+        return Ok(res);
     }
     if !ctl.gen_opt.reprod && !ctl.gen_opt.ncell && !v["high_confidence"].as_bool().unwrap_or(false)
     {
-        return Ok(());
+        return Ok(res);
     }
     let tigname = v["contig_name"].to_string().between("\"", "\"").to_string();
     let full_seq = &v["sequence"].to_string().between("\"", "\"").to_string();
@@ -175,8 +180,8 @@ fn parse_vector_entry_from_json(
     let mut cdr3_aa: String;
     let mut cdr3_dna: String;
     let mut cdr3_start: usize;
-    if v.get("version").is_some() {
-        *cr_version = v["version"].to_string().between("\"", "\"").to_string();
+    if let Some(version) = v.get("version") {
+        res.cr_version = version.to_string().between("\"", "\"").to_string();
     }
 
     // Read validated and non-validated UMIs.
@@ -274,7 +279,7 @@ fn parse_vector_entry_from_json(
             ) {
                 print!("{}", strme(&log));
                 println!("invalid");
-                return Ok(());
+                return Ok(res);
             }
         } else if !is_valid(
             &x,
@@ -284,7 +289,7 @@ fn parse_vector_entry_from_json(
             &mut log,
             Some(ctl.gen_opt.gamma_delta),
         ) {
-            return Ok(());
+            return Ok(res);
         }
         let mut cdr3 = Vec::<(usize, Vec<u8>, usize, usize)>::new();
         get_cdr3_using_ann(&x, refdata, &ann, &mut cdr3);
@@ -435,7 +440,7 @@ fn parse_vector_entry_from_json(
             }
         }
         if v_ref_id == 1000000 {
-            return Ok(());
+            return Ok(res);
         }
 
         // Compute annv from cigarv.  We don't compute the mismatch entry.
@@ -495,14 +500,14 @@ fn parse_vector_entry_from_json(
         let x = DnaString::from_dna_string(full_seq);
         get_cdr3_using_ann(&x, refdata, &annv, &mut cdr3);
         if cdr3.is_empty() {
-            return Ok(());
+            return Ok(res);
         }
         let cdr3_aa_alt = stringme(&cdr3[0].1);
         if cdr3_aa != cdr3_aa_alt {
             // This is particularly pathological and rare:
 
             if tig_start as usize > cdr3[0].0 {
-                return Ok(());
+                return Ok(res);
             }
 
             // Define start.
@@ -525,10 +530,10 @@ fn parse_vector_entry_from_json(
     // It is not known if these correspond to bugs in cellranger that were subsequently fixed.
 
     if cdr3_aa.contains('*') {
-        return Ok(());
+        return Ok(res);
     }
     if cdr3_start + 3 * cdr3_aa.len() > tig_stop as usize - tig_start as usize {
-        return Ok(());
+        return Ok(res);
     }
 
     // Keep going.
@@ -606,7 +611,7 @@ fn parse_vector_entry_from_json(
     if invalidated_umis_present {
         invalu = Some(invalidated_umis);
     }
-    tigs.push(TigData {
+    res.tig = Some(TigData {
         cdr3_dna,
         len: seq.len(),
         v_start: tig_start,
@@ -648,7 +653,7 @@ fn parse_vector_entry_from_json(
         invalidated_umis: invalu,
         frac_reads_used,
     });
-    Ok(())
+    Ok(res)
 }
 
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
@@ -682,8 +687,7 @@ struct ReadJsonResult {
     gex_cells: Vec<String>,
     gex_cells_specified: bool,
     cr_version: String,
-    tigs: Vec<TigData>,
-    err: String,
+    tig_bc: Vec<Vec<TigData>>,
 }
 
 fn read_json(
@@ -694,13 +698,8 @@ fn read_json(
     refdata: &RefData,
     to_ref_index: &HashMap<usize, usize>,
     reannotate: bool,
-    cr_version: &mut String,
     ctl: &EncloneControl,
-    vdj_cells: &mut Vec<String>,
-    gex_cells: &mut Vec<String>,
-    gex_cells_specified: &mut bool,
-) -> Result<Vec<Vec<TigData>>, String> {
-    *gex_cells_specified = false;
+) -> Result<ReadJsonResult, String> {
     let mut tigs = Vec::<TigData>::new();
     let mut jsonx = json.clone();
     if !path_exists(json) {
@@ -741,51 +740,48 @@ fn read_json(
             }
         }
     }
-    let mut results = Vec::<(usize, ReadJsonResult)>::new();
-    for i in 0..xs.len() {
-        results.push((i, Default::default()));
-    }
     let exiting = AtomicBool::new(false);
-    results.par_iter_mut().for_each(|(i, res)| {
-        let i = *i;
-        let resx = parse_vector_entry_from_json(
-            &xs[i],
-            json,
-            accept_inconsistent,
-            origin_info,
-            li,
-            refdata,
-            to_ref_index,
-            reannotate,
-            ctl,
-            &mut res.vdj_cells,
-            &mut res.gex_cells,
-            &mut res.gex_cells_specified,
-            &mut res.cr_version,
-            &mut res.tigs,
-            &exiting,
-        );
-        if let Err(resx) = resx {
-            res.err = resx;
-        }
-    });
-    for (_, result) in &results {
-        if !result.err.is_empty() {
-            return Err(result.err.clone());
+    let results: Vec<_> = xs
+        .par_iter()
+        .map(|entry| {
+            parse_vector_entry_from_json(
+                entry,
+                json,
+                accept_inconsistent,
+                origin_info,
+                li,
+                refdata,
+                to_ref_index,
+                reannotate,
+                ctl,
+                &exiting,
+            )
+        })
+        .collect::<Result<Vec<_>, String>>()?;
+
+    let mut tigs = Vec::new();
+    let mut vdj_cells = Vec::new();
+    let mut gex_cells = Vec::new();
+    let mut gex_cells_specified = false;
+    let mut cr_version = String::new();
+    for result in results {
+        if let Some(tig) = result.tig {
+            tigs.push(tig);
+        }
+        if let Some(c) = result.vdj_cell {
+            vdj_cells.push(c);
+        }
+        if let Some(c) = result.gex_cell {
+            gex_cells.push(c);
         }
-    }
-    for (_, result) in results.iter_mut().take(xs.len()) {
-        vdj_cells.append(&mut result.vdj_cells);
-        gex_cells.append(&mut result.gex_cells);
         if result.gex_cells_specified {
-            *gex_cells_specified = true;
+            gex_cells_specified = true;
         }
         if !result.cr_version.is_empty() {
-            *cr_version = result.cr_version.clone();
+            cr_version = result.cr_version.clone();
         }
-        tigs.append(&mut result.tigs);
     }
-    unique_sort(gex_cells);
+    unique_sort(&mut gex_cells);
     let mut tig_bc = Vec::<Vec<TigData>>::new();
     let mut r = 0;
     while r < tigs.len() {
@@ -806,7 +802,7 @@ fn read_json(
         }
         r = s;
     }
-    unique_sort(vdj_cells);
+    unique_sort(&mut vdj_cells);
 
     // Subsample.
 
@@ -820,106 +816,90 @@ fn read_json(
             if y < 1.0 - ctl.gen_opt.subsample {
                 *del = true;
                 let bc = &bc[0].barcode;
-                let p = bin_position(vdj_cells, bc);
+                let p = bin_position(&vdj_cells, bc);
                 if p >= 0 {
                     to_delete2[p as usize] = true;
                 }
-                let p = bin_position(gex_cells, bc);
+                let p = bin_position(&gex_cells, bc);
                 if p >= 0 {
                     to_delete3[p as usize] = true;
                 }
             }
         }
         erase_if(&mut tig_bc, &to_delete1);
-        erase_if(vdj_cells, &to_delete2);
-        erase_if(gex_cells, &to_delete3);
+        erase_if(&mut vdj_cells, &to_delete2);
+        erase_if(&mut gex_cells, &to_delete3);
     }
 
     // Done.
 
-    Ok(tig_bc)
+    Ok(ReadJsonResult {
+        vdj_cells,
+        gex_cells,
+        gex_cells_specified,
+        cr_version,
+        tig_bc,
+    })
 }
 
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
 
-// Parse the JSON annotations file(s).
-#[derive(Default)]
-struct LoadResult {
+pub struct Annotations {
+    vdj_cells: Vec<Vec<String>>,
+    gex_cells: Vec<Vec<String>>,
+    gex_cells_specified: Vec<bool>,
     tig_bc: Vec<Vec<TigData>>,
-    cr_version: String,
-    vdj_cells: Vec<String>,
-    gex_cells: Vec<String>,
-    gex_cells_specified: bool,
-    err: String,
+    fate: Vec<HashMap<String, BarcodeFate>>,
 }
 
 pub fn parse_json_annotations_files(
     ctl: &EncloneControl,
-    tig_bc: &mut Vec<Vec<TigData>>,
     refdata: &RefData,
     to_ref_index: &HashMap<usize, usize>,
-    vdj_cells: &mut Vec<Vec<String>>,
-    gex_cells: &mut Vec<Vec<String>>,
-    gex_cells_specified: &mut Vec<bool>,
-    fate: &mut [HashMap<String, BarcodeFate>],
-) -> Result<(), String> {
-    let mut results = Vec::<(usize, LoadResult)>::new();
-    for i in 0..ctl.origin_info.dataset_path.len() {
-        results.push((i, Default::default()));
-    }
+) -> Result<Annotations, String> {
     // Note: only tracking truncated seq and quals initially
     let ann = if !ctl.gen_opt.cellranger {
         "all_contig_annotations.json"
     } else {
         "contig_annotations.json"
     };
-    results.par_iter_mut().for_each(|(li, res)| {
-        let li = *li;
-        let json = format!("{}/{ann}", ctl.origin_info.dataset_path[li]);
-        let json_lz4 = format!("{}/{ann}.lz4", ctl.origin_info.dataset_path[li]);
-        if !path_exists(&json) && !path_exists(&json_lz4) {
-            res.err = format!("\ncan't find {json} or {json_lz4}\n");
-            return;
-        }
-        let resx = read_json(
-            ctl.gen_opt.accept_inconsistent,
-            &ctl.origin_info,
-            li,
-            &json,
-            refdata,
-            to_ref_index,
-            ctl.gen_opt.reannotate,
-            &mut res.cr_version,
-            ctl,
-            &mut res.vdj_cells,
-            &mut res.gex_cells,
-            &mut res.gex_cells_specified,
-        );
-        if let Ok(resx) = resx {
-            let tig_bc: Vec<Vec<TigData>> = resx;
-            res.vdj_cells.sort();
-            res.tig_bc = tig_bc;
-        } else {
-            res.err = resx.err().unwrap();
-        }
-    });
-    for (_, result) in &results {
-        if !result.err.is_empty() {
-            return Err(result.err.clone());
-        }
-    }
-    let mut versions = Vec::<String>::new();
-    for (i, mut result) in results {
-        tig_bc.append(&mut result.tig_bc);
-        if result.cr_version.is_empty() {
-            versions.push("≤3.1".to_string());
-        } else {
-            versions.push(result.cr_version);
-        }
+    let results = ctl
+        .origin_info
+        .dataset_path
+        .par_iter()
+        .enumerate()
+        .map(|(li, dataset_path)| {
+            let json = format!("{}/{ann}", dataset_path);
+            let json_lz4 = format!("{}/{ann}.lz4", dataset_path);
+            if !path_exists(&json) && !path_exists(&json_lz4) {
+                return Err(format!("\ncan't find {json} or {json_lz4}\n"));
+            }
+            read_json(
+                ctl.gen_opt.accept_inconsistent,
+                &ctl.origin_info,
+                li,
+                &json,
+                refdata,
+                to_ref_index,
+                ctl.gen_opt.reannotate,
+                ctl,
+            )
+            .map(|r| (li, r))
+        })
+        .collect::<Result<Vec<_>, String>>()?;
+
+    let mut ann = Annotations {
+        tig_bc: Default::default(),
+        vdj_cells: Default::default(),
+        gex_cells: Default::default(),
+        gex_cells_specified: Default::default(),
+        fate: vec![HashMap::<String, BarcodeFate>::new(); ctl.origin_info.n()],
+    };
 
+    for (i, result) in results {
         let cells = &result.vdj_cells;
         let mut found = vec![false; cells.len()];
-        let tigs = result.tig_bc;
+        let tigs = &result.tig_bc;
         for tig in tigs {
             let p = bin_position(cells, &tig[0].barcode);
             if p >= 0 {
@@ -928,13 +908,14 @@ pub fn parse_json_annotations_files(
         }
         for j in 0..found.len() {
             if !found[j] {
-                fate[i].insert(cells[j].clone(), BarcodeFate::NonProductive);
+                ann.fate[i].insert(cells[j].clone(), BarcodeFate::NonProductive);
             }
         }
 
-        vdj_cells.push(result.vdj_cells);
-        gex_cells.push(result.gex_cells);
-        gex_cells_specified.push(result.gex_cells_specified);
+        ann.tig_bc.extend(result.tig_bc.into_iter());
+        ann.vdj_cells.push(result.vdj_cells);
+        ann.gex_cells.push(result.gex_cells);
+        ann.gex_cells_specified.push(result.gex_cells_specified);
     }
-    Ok(())
+    Ok(ann)
 }

From 8c21c0bc39f21de7358ea5b6c27cb5c1509a468a Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Fri, 1 Mar 2024 14:43:38 -0800
Subject: [PATCH 03/15] Finish cleaning up the JSON loading code.

---
 enclone_args/src/read_json.rs | 108 ++++++++++++++--------------------
 enclone_stuff/src/start.rs    |  25 +++-----
 2 files changed, 54 insertions(+), 79 deletions(-)

diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index eaf595c89..01c7c651d 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -29,12 +29,10 @@ use debruijn::dna_string::DnaString;
 use enclone_core::barcode_fate::BarcodeFate;
 use enclone_core::defs::{EncloneControl, OriginInfo, TigData};
 use io_utils::{open_maybe_compressed, path_exists, read_vector_entry_from_json};
-use itertools::Itertools;
 use rand::Rng;
 use rayon::prelude::*;
 use serde_json::Value;
 use std::fmt::Write;
-use std::sync::atomic::{AtomicBool, Ordering};
 use std::{collections::HashMap, io::BufReader};
 use string_utils::{stringme, strme, TextUtils};
 use vdj_ann::{annotate, refx, transcript};
@@ -42,54 +40,44 @@ use vector_utils::{bin_position, erase_if, unique_sort};
 
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
 
-fn json_error(
-    json: Option<&str>,
-    ctl: &EncloneControl,
-    exiting: &AtomicBool,
-    msg: &str,
-) -> Result<(), String> {
-    // The following line prevents error messages from this function from being
-    // printed multiple times.
-    let mut msgx = String::new();
-    if !exiting.swap(true, Ordering::Relaxed) {
-        msgx = "\nThere is something wrong with the contig annotations in the cellranger output \
-             file"
+fn json_error(json: Option<&str>, internal_run: bool, msg: &str) -> String {
+    let mut msgx =
+        "There is something wrong with the contig annotations in the cellranger output file"
             .to_string();
-        if json.is_some() {
-            write!(msgx, "\n{}.", json.unwrap()).unwrap();
-        } else {
-            msgx += ".";
-        }
-        if ctl.gen_opt.internal_run {
-            writeln!(msgx, "\n\npossibly relevant internal data: {msg}").unwrap();
-        }
-        if ctl.gen_opt.internal_run {
-            msgx += "\n\nATTENTION INTERNAL 10X USERS!\n\
-                Quite possibly you are using data from a cellranger run carried out using a \
-                version\n\
-                between 3.1 and 4.0.  For certain of these versions, it is necessary to add the\n\
-                argument CURRENT_REF to your command line.  If that doesn't work, \
-                please see below.\n";
-        }
-        msgx += "\n\nHere is what you should do:\n\n\
-             1. If you used cellranger version ≥ 4.0, the problem is very likely\n\
-                that the directory outs/vdj_reference was not retained, so enclone\n\
-                didn't see it, and had to guess what the reference sequence was.\n\
-                Fix this and everything should be fine.\n\n\
-             2. If you used cellranger version 3.1, then you need to add a command-line\n\
-                argument REF=<vdj_reference_fasta_file_name>, or if you already did that,\n\
-                make sure it is the *same* as that which you gave cellranger.\n\n\
-             3. If you used cellranger version < 3.1 (the only other possibility), then\n\
-                you have options:\n\
-                • rerun cellranger using the current version\n\
-                • or provide an argument REF= as above and RE to force reannotation\n\
-                • or provide the argument BUILT_IN to use the current reference and force\n  \
-                  reannotation (and MOUSE if you used mouse); only works with human and mouse.\n\n\
-             Note that one way to get the error is to specify TCR when you meant BCR, or the\n\
-             other way.\n\n\
-             If you're stuck, please write to us at enclone@10xgenomics.com.\n";
-    }
-    Err(msgx)
+    if let Some(json) = json {
+        write!(msgx, "\n{}.", json).unwrap();
+    } else {
+        msgx += ".";
+    }
+    if internal_run {
+        writeln!(msgx, "\n\npossibly relevant internal data: {msg}").unwrap();
+
+        msgx += "\n\nATTENTION INTERNAL 10X USERS!\n\
+            Quite possibly you are using data from a cellranger run carried out using a \
+            version\n\
+            between 3.1 and 4.0.  For certain of these versions, it is necessary to add the\n\
+            argument CURRENT_REF to your command line.  If that doesn't work, \
+            please see below.\n";
+    }
+    msgx += "\n\nHere is what you should do:\n\n\
+        1. If you used cellranger version ≥ 4.0, the problem is very likely\n\
+        that the directory outs/vdj_reference was not retained, so enclone\n\
+        didn't see it, and had to guess what the reference sequence was.\n\
+        Fix this and everything should be fine.\n\n\
+        2. If you used cellranger version 3.1, then you need to add a command-line\n\
+        argument REF=<vdj_reference_fasta_file_name>, or if you already did that,\n\
+        make sure it is the *same* as that which you gave cellranger.\n\n\
+        3. If you used cellranger version < 3.1 (the only other possibility), then\n\
+        you have options:\n\
+        • rerun cellranger using the current version\n\
+        • or provide an argument REF= as above and RE to force reannotation\n\
+        • or provide the argument BUILT_IN to use the current reference and force\n  \
+            reannotation (and MOUSE if you used mouse); only works with human and mouse.\n\n\
+        Note that one way to get the error is to specify TCR when you meant BCR, or the\n\
+        other way.\n\n\
+        If you're stuck, please write to us at enclone@10xgenomics.com.\n";
+
+    msgx
 }
 
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
@@ -113,7 +101,6 @@ fn parse_vector_entry_from_json(
     to_ref_index: &HashMap<usize, usize>,
     reannotate: bool,
     ctl: &EncloneControl,
-    exiting: &AtomicBool,
 ) -> Result<JsonParseResult, String> {
     let mut res: JsonParseResult = Default::default();
     let v: Value = match serde_json::from_slice(x) {
@@ -374,10 +361,7 @@ fn parse_vector_entry_from_json(
                 .to_string()
                 .between("\"", "\"")
                 .to_string();
-            if refdata.name[feature_idx] != gene_name
-                && !accept_inconsistent
-                && !exiting.swap(true, Ordering::Relaxed)
-            {
+            if refdata.name[feature_idx] != gene_name && !accept_inconsistent {
                 return Err(format!(
                     "\nThere is an inconsistency between the reference \
                      file used to create the Cell Ranger output files in\n{}\nand the \
@@ -488,7 +472,7 @@ fn parse_vector_entry_from_json(
         let rt = &refdata.refs[v_ref_id];
         if annv.len() == 2 && annv[0].1 as usize > rt.len() {
             let msg = format!("annv[0].1 = {}, rt.len() = {}", annv[0].1, rt.len());
-            json_error(None, ctl, exiting, &msg)?;
+            return Err(json_error(None, ctl.gen_opt.internal_run, &msg));
         }
 
         // Check to see if the CDR3 sequence has changed.  This could happen if the cellranger
@@ -540,7 +524,7 @@ fn parse_vector_entry_from_json(
 
     if tig_start < 0 || tig_stop < 0 {
         let msg = format!("tig_start = {tig_start}, tig_stop = {tig_stop}");
-        json_error(Some(json), ctl, exiting, &msg)?;
+        return Err(json_error(Some(json), ctl.gen_opt.internal_run, &msg));
     }
     let (tig_start, tig_stop) = (tig_start as usize, tig_stop as usize);
     let quals0 = v["quals"].to_string();
@@ -740,7 +724,6 @@ fn read_json(
             }
         }
     }
-    let exiting = AtomicBool::new(false);
     let results: Vec<_> = xs
         .par_iter()
         .map(|entry| {
@@ -754,7 +737,6 @@ fn read_json(
                 to_ref_index,
                 reannotate,
                 ctl,
-                &exiting,
             )
         })
         .collect::<Result<Vec<_>, String>>()?;
@@ -845,11 +827,11 @@ fn read_json(
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
 
 pub struct Annotations {
-    vdj_cells: Vec<Vec<String>>,
-    gex_cells: Vec<Vec<String>>,
-    gex_cells_specified: Vec<bool>,
-    tig_bc: Vec<Vec<TigData>>,
-    fate: Vec<HashMap<String, BarcodeFate>>,
+    pub vdj_cells: Vec<Vec<String>>,
+    pub gex_cells: Vec<Vec<String>>,
+    pub gex_cells_specified: Vec<bool>,
+    pub tig_bc: Vec<Vec<TigData>>,
+    pub fate: Vec<HashMap<String, BarcodeFate>>,
 }
 
 pub fn parse_json_annotations_files(
diff --git a/enclone_stuff/src/start.rs b/enclone_stuff/src/start.rs
index b537b6c41..62c4784c0 100644
--- a/enclone_stuff/src/start.rs
+++ b/enclone_stuff/src/start.rs
@@ -18,7 +18,7 @@ use enclone::join::join_exacts;
 use enclone::misc1::{cross_filter, lookup_heavy_chain_reuse};
 use enclone::misc2::{check_for_barcode_reuse, find_exact_subclonotypes, search_for_shm_indels};
 use enclone::misc3::sort_tig_bc;
-use enclone_args::read_json::parse_json_annotations_files;
+use enclone_args::read_json::{parse_json_annotations_files, Annotations};
 use enclone_core::barcode_fate::BarcodeFate;
 use enclone_core::defs::{AlleleData, CloneInfo, TigData};
 use enclone_core::enclone_structs::{EncloneExacts, EncloneIntermediates, EncloneSetup};
@@ -121,21 +121,14 @@ pub fn main_enclone_start(setup: EncloneSetup) -> Result<EncloneIntermediates, S
     // Parse the json annotations file.
 
     let tparse = Instant::now();
-    let mut tig_bc = Vec::<Vec<TigData>>::new();
-    let mut vdj_cells = Vec::<Vec<String>>::new();
-    let mut gex_cells = Vec::<Vec<String>>::new();
-    let mut gex_cells_specified = Vec::<bool>::new();
-    let mut fate = vec![HashMap::<String, BarcodeFate>::new(); ctl.origin_info.n()];
-    parse_json_annotations_files(
-        ctl,
-        &mut tig_bc,
-        refdata,
-        to_ref_index,
-        &mut vdj_cells,
-        &mut gex_cells,
-        &mut gex_cells_specified,
-        &mut fate,
-    )?;
+
+    let Annotations {
+        mut tig_bc,
+        gex_cells,
+        gex_cells_specified,
+        vdj_cells,
+        mut fate,
+    } = parse_json_annotations_files(ctl, refdata, to_ref_index)?;
     ctl.perf_stats(&tparse, "loading from json");
 
     // Populate features.

From 082a674c31201aa4a57788101830027cb530d6d1 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Fri, 1 Mar 2024 14:46:38 -0800
Subject: [PATCH 04/15] Remove unused variable tigs from read_json.

---
 enclone_args/src/read_json.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index 01c7c651d..dea22b7bb 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -684,7 +684,6 @@ fn read_json(
     reannotate: bool,
     ctl: &EncloneControl,
 ) -> Result<ReadJsonResult, String> {
-    let mut tigs = Vec::<TigData>::new();
     let mut jsonx = json.clone();
     if !path_exists(json) {
         jsonx = format!("{json}.lz4");

From ddad6262077e3d32decd4355a57707bb64b3d207 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Fri, 1 Mar 2024 16:45:53 -0800
Subject: [PATCH 05/15] Finish refactoring JSON parsing to deserialize into
 the existing vdj_ann ContigAnnotation type.

---
 Cargo.lock                    |   1 +
 enclone_args/Cargo.toml       |   1 +
 enclone_args/src/read_json.rs | 382 ++++++++++++----------------------
 3 files changed, 137 insertions(+), 247 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cdf177945..0aaf9b057 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -468,6 +468,7 @@ dependencies = [
  "serde_json",
  "string_utils",
  "vdj_ann",
+ "vdj_types",
  "vector_utils",
 ]
 
diff --git a/enclone_args/Cargo.toml b/enclone_args/Cargo.toml
index cc16c5d5c..92a392fab 100644
--- a/enclone_args/Cargo.toml
+++ b/enclone_args/Cargo.toml
@@ -36,6 +36,7 @@ regex = { version = "1", default-features = false, features = ["std", "perf"] }
 serde_json = "1"
 string_utils = { path = "../string_utils" }
 vdj_ann = { path = "../vdj_ann" }
+vdj_types = { path = "../vdj_types" }
 vector_utils = { path = "../vector_utils" }
 
 [target.'cfg(not(windows))'.dependencies.hdf5]
diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index dea22b7bb..745ce2f27 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -1,41 +1,20 @@
 // Copyright (c) 2021 10X Genomics, Inc. All rights reserved.
 
-// Fields that are used in all_contig_annotations.json:
-// • barcode
-// • is_cell and is_asm_cell -- both are optional, but at least one needs to be present and
-//   true for a cell called by the VDJ pipeline
-// • is_gex_cell -- optional
-// • productive -- optional but should be true for contigs to be used
-// • high_confidence -- optional but should be true for contigs to be used
-// • contig_name
-// • sequence
-// • version -- optional
-// • validated_umis -- optional
-// • non_validated_umis -- optional
-// • invalidated_umis -- optional
-// • fraction_of_reads_for_this_barcode_provided_as_input_to_assembly -- optional
-// • quals
-// • umi_count
-// • read_count
-// • cdr3, unless in reannotate mode
-// • cdr3_seq, unless in reannotate mode
-// • cdr3_start, unless in reannotate mode
-// • annotations, unless in reannotate mode.
-
 use self::annotate::{annotate_seq, get_cdr3_using_ann, print_some_annotations};
 use self::refx::RefData;
 use self::transcript::is_valid;
 use debruijn::dna_string::DnaString;
 use enclone_core::barcode_fate::BarcodeFate;
 use enclone_core::defs::{EncloneControl, OriginInfo, TigData};
-use io_utils::{open_maybe_compressed, path_exists, read_vector_entry_from_json};
+use io_utils::{open_maybe_compressed, path_exists};
 use rand::Rng;
 use rayon::prelude::*;
-use serde_json::Value;
+use std::collections::HashMap;
 use std::fmt::Write;
-use std::{collections::HashMap, io::BufReader};
 use string_utils::{stringme, strme, TextUtils};
+use vdj_ann::annotate::ContigAnnotation;
 use vdj_ann::{annotate, refx, transcript};
+use vdj_types::{VdjChain, VdjRegion};
 use vector_utils::{bin_position, erase_if, unique_sort};
 
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
@@ -87,48 +66,35 @@ struct JsonParseResult {
     vdj_cell: Option<String>,
     gex_cell: Option<String>,
     gex_cells_specified: bool,
-    cr_version: String,
     tig: Option<TigData>,
 }
 
 fn parse_vector_entry_from_json(
-    x: &[u8],
+    ann: ContigAnnotation,
     json: &str,
     accept_inconsistent: bool,
     origin_info: &OriginInfo,
-    li: usize,
+    dataset_index: usize,
     refdata: &RefData,
     to_ref_index: &HashMap<usize, usize>,
     reannotate: bool,
     ctl: &EncloneControl,
 ) -> Result<JsonParseResult, String> {
     let mut res: JsonParseResult = Default::default();
-    let v: Value = match serde_json::from_slice(x) {
-        Err(_) => {
-            return Err(format!(
-                "\nInternal error, failed to parse a value from a string.  The string is:\n{}\n",
-                strme(x)
-            ));
-        }
-        Ok(v) => v,
-    };
-    let barcode = v["barcode"].to_string().between("\"", "\"").to_string();
 
     // Get cell status.  Sometime after CR 4.0 was released, and before 4.1 was released,
     // we added new fields is_asm_cell and is_gex_cell to the json file.  The value of
     // is_asm_cell is the original determination of "cell" in the VDJ pipeline, whereas the
     // value of is_gex_cell is that for the GEX pipeline.
-
-    let mut is_cell = v["is_cell"].as_bool().unwrap_or(false);
-    let is_asm_cell = v["is_asm_cell"].as_bool().unwrap_or(false);
-    if is_asm_cell {
+    let mut is_cell = ann.is_cell;
+    if ann.is_asm_cell.is_some_and(|is_asm_cell| is_asm_cell) {
         is_cell = true;
     }
 
-    if let Some(is_gex_cell) = v["is_gex_cell"].as_bool() {
+    if let Some(is_gex_cell) = ann.is_gex_cell {
         res.gex_cells_specified = true;
         if is_gex_cell {
-            res.gex_cell = Some(barcode.clone());
+            res.gex_cell = Some(ann.barcode.clone());
         }
     }
 
@@ -136,20 +102,17 @@ fn parse_vector_entry_from_json(
         return Ok(res);
     }
     if is_cell {
-        res.vdj_cell = Some(barcode.clone());
+        res.vdj_cell = Some(ann.barcode.clone());
     }
 
     // Proceed.
 
-    if !ctl.gen_opt.reprod && !v["productive"].as_bool().unwrap_or(false) {
+    if !ctl.gen_opt.reprod && !ann.productive.unwrap_or(false) {
         return Ok(res);
     }
-    if !ctl.gen_opt.reprod && !ctl.gen_opt.ncell && !v["high_confidence"].as_bool().unwrap_or(false)
-    {
+    if !ctl.gen_opt.reprod && !ctl.gen_opt.ncell && !ann.high_confidence {
         return Ok(res);
     }
-    let tigname = v["contig_name"].to_string().between("\"", "\"").to_string();
-    let full_seq = &v["sequence"].to_string().between("\"", "\"").to_string();
     let mut left = false;
     let (mut v_ref_id, mut j_ref_id) = (1000000, 0);
     let mut d_ref_id: Option<usize> = None;
@@ -167,99 +130,62 @@ fn parse_vector_entry_from_json(
     let mut cdr3_aa: String;
     let mut cdr3_dna: String;
     let mut cdr3_start: usize;
-    if let Some(version) = v.get("version") {
-        res.cr_version = version.to_string().between("\"", "\"").to_string();
-    }
 
-    // Read validated and non-validated UMIs.
-
-    let mut validated_umis = Vec::<String>::new();
-    let mut validated_umis_present = false;
-    let val = v["validated_umis"].as_array();
-    if let Some(val) = val {
-        validated_umis_present = true;
-        for vi in val {
-            validated_umis.push(vi.to_string().between("\"", "\"").to_string());
-        }
-    }
-    let mut non_validated_umis = Vec::<String>::new();
-    let mut non_validated_umis_present = false;
-    let non_val = v["non_validated_umis"].as_array();
-    if let Some(non_val) = non_val {
-        non_validated_umis_present = true;
-        for nv in non_val {
-            non_validated_umis.push(nv.to_string().between("\"", "\"").to_string());
-        }
-    }
-    let mut invalidated_umis = Vec::<String>::new();
-    let mut invalidated_umis_present = false;
-    let inval = v["invalidated_umis"].as_array();
-    if let Some(inval) = inval {
-        invalidated_umis_present = true;
-        for inv in inval {
-            invalidated_umis.push(inv.to_string().between("\"", "\"").to_string());
-        }
-    }
-
-    // Read fraction_of_reads_for_this_barcode_provided_as_input_to_assembly.
-
-    let mut frac_reads_used = None;
-    let f = v["fraction_of_reads_for_this_barcode_provided_as_input_to_assembly"].as_f64();
-    if let Some(f) = f {
-        frac_reads_used = Some((f * 1_000_000.0).round() as u32);
-    }
+    let frac_reads_used = ann
+        .fraction_of_reads_for_this_barcode_provided_as_input_to_assembly
+        .map(|f| (f * 1_000_000.0).round() as u32);
 
     // Reannotate.
-
     if reannotate || ctl.gen_opt.reprod {
-        let x = DnaString::from_dna_string(full_seq);
-        let mut ann = Vec::<(i32, i32, i32, i32, i32)>::new();
-        annotate_seq(&x, refdata, &mut ann, true, false, true);
+        let x = DnaString::from_dna_string(&ann.sequence);
+        let mut ann1 = Vec::<(i32, i32, i32, i32, i32)>::new();
+        annotate_seq(&x, refdata, &mut ann1, true, false, true);
 
         // If there are multiple V segment alignments, possibly reduce to just one.
 
         let mut ann2 = Vec::<(i32, i32, i32, i32, i32)>::new();
         let mut j = 0;
-        while j < ann.len() {
-            let t = ann[j].2 as usize;
+        while j < ann1.len() {
+            let t = ann1[j].2 as usize;
             let mut k = j + 1;
-            while k < ann.len() {
-                if refdata.segtype[ann[k].2 as usize] != refdata.segtype[t] {
+            while k < ann1.len() {
+                if refdata.segtype[ann1[k].2 as usize] != refdata.segtype[t] {
                     break;
                 }
                 k += 1;
             }
             if refdata.segtype[t] == "V" && k - j > 1 {
                 let mut entries = 1;
-                if j < ann.len() - 1
-                    && ann[j + 1].2 as usize == t
-                    && ((ann[j].0 + ann[j].1 == ann[j + 1].0 && ann[j].3 + ann[j].1 < ann[j + 1].3)
-                        || (ann[j].0 + ann[j].1 < ann[j + 1].0
-                            && ann[j].3 + ann[j].1 == ann[j + 1].3))
+                if j < ann1.len() - 1
+                    && ann1[j + 1].2 as usize == t
+                    && ((ann1[j].0 + ann1[j].1 == ann1[j + 1].0
+                        && ann1[j].3 + ann1[j].1 < ann1[j + 1].3)
+                        || (ann1[j].0 + ann1[j].1 < ann1[j + 1].0
+                            && ann1[j].3 + ann1[j].1 == ann1[j + 1].3))
                 {
                     entries = 2;
                 }
-                ann2.extend(&ann[j..j + entries]);
+                ann2.extend(&ann1[j..j + entries]);
             } else {
-                ann2.extend(&ann[j..k]);
+                ann2.extend(&ann1[j..k]);
             }
             j = k;
         }
-        ann = ann2;
+        ann1 = ann2;
 
         // Proceed.
 
-        if ctl.gen_opt.trace_barcode == *barcode {
+        if ctl.gen_opt.trace_barcode == ann.barcode {
             let mut log = Vec::<u8>::new();
-            print_some_annotations(refdata, &ann, &mut log, false);
+            print_some_annotations(refdata, &ann1, &mut log, false);
             print!("\n{}", strme(&log));
         }
         let mut log = Vec::<u8>::new();
-        if ctl.gen_opt.trace_barcode == *barcode {
+        if ctl.gen_opt.trace_barcode == ann.barcode {
             if !is_valid(
                 &x,
                 refdata,
-                &ann,
+                &ann1,
                 true,
                 &mut log,
                 Some(ctl.gen_opt.gamma_delta),
@@ -271,7 +197,7 @@ fn parse_vector_entry_from_json(
         } else if !is_valid(
             &x,
             refdata,
-            &ann,
+            &ann1,
             false,
             &mut log,
             Some(ctl.gen_opt.gamma_delta),
@@ -279,14 +205,14 @@ fn parse_vector_entry_from_json(
             return Ok(res);
         }
         let mut cdr3 = Vec::<(usize, Vec<u8>, usize, usize)>::new();
-        get_cdr3_using_ann(&x, refdata, &ann, &mut cdr3);
+        get_cdr3_using_ann(&x, refdata, &ann1, &mut cdr3);
         cdr3_aa = stringme(&cdr3[0].1);
         cdr3_start = cdr3[0].0;
         cdr3_dna = x
             .slice(cdr3_start, cdr3_start + 3 * cdr3_aa.len())
             .to_string();
         let mut seen_j = false;
-        for anni in ann {
+        for anni in ann1 {
             let t = anni.2 as usize;
             if refdata.is_u(t) {
                 u_ref_id = Some(t);
@@ -305,7 +231,7 @@ fn parse_vector_entry_from_json(
                     if tig_start > cdr3_start as isize {
                         panic!(
                             "Something is wrong with the CDR3 start for this contig:\n\n{}.",
-                            &full_seq
+                            ann.sequence
                         );
                     }
                     cdr3_start -= tig_start as usize;
@@ -332,35 +258,31 @@ fn parse_vector_entry_from_json(
     } else {
         // Use annotations from json file.
 
-        cdr3_aa = v["cdr3"].to_string().between("\"", "\"").to_string();
-        cdr3_dna = v["cdr3_seq"].to_string().between("\"", "\"").to_string();
-        cdr3_start = v["cdr3_start"].as_u64().unwrap() as usize;
-        let ann = v["annotations"].as_array();
-        if ann.is_none() {
+        cdr3_aa = ann.cdr3.unwrap();
+        cdr3_dna = ann.cdr3_seq.unwrap();
+        cdr3_start = ann.cdr3_start.unwrap();
+        let annotations = ann.annotations;
+        if annotations.is_empty() {
             return Err(format!(
                 "\nThe file\n{json}\ndoes not contain annotations.  To use enclone with it, \
                     please specify the argument BUILT_IN\nto force use of the internal \
                     reference and recompute annotations.\n"
             ));
         }
-        let ann = ann.unwrap();
         let mut cigarv = String::new(); // cigar for V segment
-        for a in ann {
-            let region_type = &a["feature"]["region_type"];
-            let feature_id = a["feature"]["feature_id"].as_u64().unwrap() as usize;
+        for a in annotations {
+            let region_type = a.feature.region_type;
+            let feature_id = a.feature.feature_id;
             if !to_ref_index.contains_key(&feature_id) {
                 continue;
             }
             let feature_idx = to_ref_index[&feature_id];
-            let ref_start = a["annotation_match_start"].as_u64().unwrap() as usize;
-            if region_type == "L-REGION+V-REGION" {
-                v_stop = a["contig_match_end"].as_i64().unwrap() as usize;
-                v_stop_ref = a["annotation_match_end"].as_i64().unwrap() as usize;
+            let ref_start = a.annotation_match_start;
+            if region_type == VdjRegion::V {
+                v_stop = a.contig_match_end;
+                v_stop_ref = a.annotation_match_end;
             }
-            let gene_name = a["feature"]["gene_name"]
-                .to_string()
-                .between("\"", "\"")
-                .to_string();
+            let gene_name = a.feature.gene_name;
             if refdata.name[feature_idx] != gene_name && !accept_inconsistent {
                 return Err(format!(
                     "\nThere is an inconsistency between the reference \
@@ -383,43 +305,39 @@ fn parse_vector_entry_from_json(
                     refdata.name[feature_idx]
                 ));
             }
-            if region_type == "L-REGION+V-REGION" && ref_start == 0 {
-                let chain = a["feature"]["chain"]
-                    .to_string()
-                    .between("\"", "\"")
-                    .to_string();
-                // if !chain.starts_with("IG") { continue; } // *******************
-                tig_start = a["contig_match_start"].as_i64().unwrap() as isize;
+            if region_type == VdjRegion::V && ref_start == 0 {
+                let chain = a.feature.chain;
+
+                tig_start = a.contig_match_start as isize;
                 cdr3_start -= tig_start as usize;
-                chain_type = chain.clone();
-                if chain == *"IGH"
-                    || chain == *"TRB"
-                    || (chain == *"TRD" && ctl.gen_opt.gamma_delta)
+                if chain == VdjChain::IGH
+                    || chain == VdjChain::TRB
+                    || (chain == VdjChain::TRD && ctl.gen_opt.gamma_delta)
                 {
                     left = true;
                 }
                 v_ref_id = feature_idx;
-                cigarv = a["cigar"].to_string().between("\"", "\"").to_string();
+                cigarv = a.cigar;
             } else {
                 // also check for IG chain?????????????????????????????????????????
-                let ref_stop = a["annotation_match_end"].as_u64().unwrap() as usize;
-                let ref_len = a["annotation_length"].as_u64().unwrap() as usize;
-                if region_type == "J-REGION" && ref_stop == ref_len {
-                    tig_stop = a["contig_match_end"].as_i64().unwrap() as isize;
+                let ref_stop = a.annotation_match_end;
+                let ref_len = a.annotation_length;
+                if region_type == VdjRegion::J && ref_stop == ref_len {
+                    tig_stop = a.contig_match_end as isize;
                     j_ref_id = feature_idx;
-                    j_start = a["contig_match_start"].as_i64().unwrap() as usize;
-                    j_start_ref = a["annotation_match_start"].as_i64().unwrap() as usize;
+                    j_start = a.contig_match_start;
+                    j_start_ref = a.annotation_match_start;
                 }
-                if region_type == "5'UTR" {
+                if region_type == VdjRegion::UTR {
                     u_ref_id = Some(feature_idx);
                 }
-                if region_type == "D-REGION" {
-                    d_start = Some(a["contig_match_start"].as_i64().unwrap() as usize);
+                if region_type == VdjRegion::D {
+                    d_start = Some(a.contig_match_start);
                     d_ref_id = Some(feature_idx);
                 }
-                if region_type == "C-REGION" {
+                if region_type == VdjRegion::C {
                     c_ref_id = Some(feature_idx);
-                    c_start = Some(a["contig_match_start"].as_i64().unwrap() as usize);
+                    c_start = Some(a.contig_match_start);
                 }
             }
         }
@@ -481,7 +399,7 @@ fn parse_vector_entry_from_json(
         // inconsistencies, leading to an assert somewhere downstream.
 
         let mut cdr3 = Vec::<(usize, Vec<u8>, usize, usize)>::new();
-        let x = DnaString::from_dna_string(full_seq);
+        let x = DnaString::from_dna_string(&ann.sequence);
         get_cdr3_using_ann(&x, refdata, &annv, &mut cdr3);
         if cdr3.is_empty() {
             return Ok(res);
@@ -527,11 +445,9 @@ fn parse_vector_entry_from_json(
         return Err(json_error(Some(json), ctl.gen_opt.internal_run, &msg));
     }
     let (tig_start, tig_stop) = (tig_start as usize, tig_stop as usize);
-    let quals0 = v["quals"].to_string();
-    let quals0 = quals0.after("\"").as_bytes();
     let mut quals = Vec::<u8>::new();
     let mut slashed = false;
-    for &qual in quals0.iter().take(quals0.len() - 1) {
+    for &qual in ann.quals.as_bytes() {
         if !slashed && qual == b'\\'
         /* && ( i == 0 || quals0[i-1] != b'\\' ) */
         {
@@ -541,36 +457,42 @@ fn parse_vector_entry_from_json(
         slashed = false;
         quals.push(qual);
     }
-    assert_eq!(full_seq.len(), quals.len());
-    let seq = &full_seq[tig_start..tig_stop].to_string();
+    assert_eq!(ann.sequence.len(), quals.len());
+    let seq = &ann.sequence[tig_start..tig_stop].to_string();
     for qual in quals.iter_mut() {
         *qual -= 33_u8;
     }
     let full_quals = quals;
     let quals = full_quals[tig_start..tig_stop].to_vec();
-    let umi_count = v["umi_count"].as_i64().unwrap() as usize;
-    let read_count = v["read_count"].as_i64().unwrap() as usize;
-    let origin = origin_info.origin_for_bc[li].get(&barcode).or_else(|| {
-        // the way we use s1 here is flaky
-        if !origin_info.origin_id[li].is_empty()
-            && (origin_info.origin_id[li] != *"s1" || origin_info.origin_for_bc[li].is_empty())
-        {
-            Some(&origin_info.origin_id[li])
-        } else {
-            None
-        }
-    });
-    let donor = origin_info.donor_for_bc[li].get(&barcode).or_else(|| {
-        // the way we use d1 here is flaky
-        if !origin_info.origin_id[li].is_empty()
-            && (origin_info.donor_id[li] != *"d1" || origin_info.donor_for_bc[li].is_empty())
-        {
-            Some(&origin_info.donor_id[li])
-        } else {
-            None
-        }
-    });
-    let tag = origin_info.tag[li].get(&barcode);
+    let umi_count = ann.umi_count;
+    let read_count = ann.read_count;
+    let origin = origin_info.origin_for_bc[dataset_index]
+        .get(&ann.barcode)
+        .or_else(|| {
+            // the way we use s1 here is flaky
+            if !origin_info.origin_id[dataset_index].is_empty()
+                && (origin_info.origin_id[dataset_index] != *"s1"
+                    || origin_info.origin_for_bc[dataset_index].is_empty())
+            {
+                Some(&origin_info.origin_id[dataset_index])
+            } else {
+                None
+            }
+        });
+    let donor = origin_info.donor_for_bc[dataset_index]
+        .get(&ann.barcode)
+        .or_else(|| {
+            // the way we use d1 here is flaky
+            if !origin_info.origin_id[dataset_index].is_empty()
+                && (origin_info.donor_id[dataset_index] != *"d1"
+                    || origin_info.donor_for_bc[dataset_index].is_empty())
+            {
+                Some(&origin_info.donor_id[dataset_index])
+            } else {
+                None
+            }
+        });
+    let tag = origin_info.tag[dataset_index].get(&ann.barcode);
     let mut origin_index = None;
     let mut donor_index = None;
     let mut tag_index = None;
@@ -583,18 +505,7 @@ fn parse_vector_entry_from_json(
     if let Some(tag) = tag {
         tag_index = Some(bin_position(&origin_info.tag_list, tag) as usize);
     }
-    let mut valu = None;
-    if validated_umis_present {
-        valu = Some(validated_umis);
-    }
-    let mut non_valu = None;
-    if non_validated_umis_present {
-        non_valu = Some(non_validated_umis);
-    }
-    let mut invalu = None;
-    if invalidated_umis_present {
-        invalu = Some(invalidated_umis);
-    }
+
     res.tig = Some(TigData {
         cdr3_dna,
         len: seq.len(),
@@ -606,7 +517,7 @@ fn parse_vector_entry_from_json(
         j_start_ref,
         j_stop: tig_stop,
         c_start,
-        full_seq: full_seq.as_bytes().to_vec(),
+        full_seq: ann.sequence.as_bytes().to_vec(),
         v_ref_id,
         d_ref_id,
         j_ref_id,
@@ -621,10 +532,10 @@ fn parse_vector_entry_from_json(
         cdr3_start,
         quals,
         full_quals,
-        barcode,
-        tigname,
+        barcode: ann.barcode,
+        tigname: ann.contig_name,
         left,
-        dataset_index: li,
+        dataset_index,
         origin_index,
         donor_index,
         tag_index,
@@ -632,9 +543,9 @@ fn parse_vector_entry_from_json(
         read_count,
         chain_type,
         annv,
-        validated_umis: valu,
-        non_validated_umis: non_valu,
-        invalidated_umis: invalu,
+        validated_umis: ann.validated_umis,
+        non_validated_umis: ann.non_validated_umis,
+        invalidated_umis: ann.invalidated_umis,
         frac_reads_used,
     });
     Ok(res)
@@ -644,40 +555,23 @@ fn parse_vector_entry_from_json(
 
 // Parse the JSON annotations file.
 //
-// In the future could be converted to LazyWrite:
-// https://martian-lang.github.io/martian-rust/doc/martian_filetypes/json_file/
-// index.html#lazy-readwrite-example.
-//
 // Tracking contigs using bc_cdr3_aa; could improve later.
 //
 // This section requires 3.1.  If you want to avoid that, do something to make tig_start
 // and tig_stop always nonnegative.  Or use the RE option.
-//
-// Computational performance.  It would appear that nearly all the time here is spent in
-// two lines:
-//
-// read_vector_entry_from_json(&mut f) {
-// let v: Value = serde_json::from_str(strme(&x)).unwrap();
-// (Should retest.)
-//
-// and simply reading the file lines is several times faster.  So the way we parse the
-// files is suboptimal.  If we want to make this faster, one option would be to speed up
-// this code.  Another would be to write out a binary version of the JSON file that contains
-// only the information that we need.
 
 #[derive(Default)]
 struct ReadJsonResult {
     vdj_cells: Vec<String>,
     gex_cells: Vec<String>,
     gex_cells_specified: bool,
-    cr_version: String,
     tig_bc: Vec<Vec<TigData>>,
 }
 
 fn read_json(
     accept_inconsistent: bool,
     origin_info: &OriginInfo,
-    li: usize,
+    dataset_index: usize,
     json: &String,
     refdata: &RefData,
     to_ref_index: &HashMap<usize, usize>,
@@ -707,45 +601,43 @@ fn read_json(
              input files to enclone, including the PRE argument.\n"
         ));
     }
-    let mut f = BufReader::new(open_maybe_compressed(&jsonx));
-    // ◼ This loop could be speeded up, see comments above.
-    let mut xs = Vec::<Vec<u8>>::new();
-    loop {
-        let x = read_vector_entry_from_json(&mut f);
-        if x.is_err() {
-            eprintln!("\nProblem reading {jsonx}.\n");
-            return Err(x.err().unwrap());
-        }
-        match x.unwrap() {
-            None => break,
-            Some(x) => {
-                xs.push(x);
-            }
-        }
-    }
-    let results: Vec<_> = xs
-        .par_iter()
-        .map(|entry| {
+    // Read the entire file to memory before parsing.
+    // See https://github.com/serde-rs/json/issues/160
+    // The previous implementation was essentially doing this anyway, so it
+    // shouldn't drastically change our memory consumption.
+    let mut contents = String::new();
+    open_maybe_compressed(&jsonx)
+        .read_to_string(&mut contents)
+        .unwrap();
+
+    let mut results: Vec<_> = serde_json::Deserializer::from_str(&contents)
+        .into_iter::<ContigAnnotation>()
+        .enumerate()
+        .par_bridge()
+        .map(|(ann_index, entry)| {
             parse_vector_entry_from_json(
-                entry,
+                entry.unwrap(),
                 json,
                 accept_inconsistent,
                 origin_info,
-                li,
+                dataset_index,
                 refdata,
                 to_ref_index,
                 reannotate,
                 ctl,
             )
+            .map(|r| (ann_index, r))
         })
         .collect::<Result<Vec<_>, String>>()?;
+    // rayon's par_bridge feature doesn't preserve order, so sort the result
+    // for stable behavior.
+    results.sort_by_key(|(ann_index, _)| *ann_index);
 
     let mut tigs = Vec::new();
     let mut vdj_cells = Vec::new();
     let mut gex_cells = Vec::new();
     let mut gex_cells_specified = false;
-    let mut cr_version = String::new();
-    for result in results {
+    for (_, result) in results {
         if let Some(tig) = result.tig {
             tigs.push(tig);
         }
@@ -758,9 +650,6 @@ fn read_json(
         if result.gex_cells_specified {
             gex_cells_specified = true;
         }
-        if !result.cr_version.is_empty() {
-            cr_version = result.cr_version.clone();
-        }
     }
     unique_sort(&mut gex_cells);
     let mut tig_bc = Vec::<Vec<TigData>>::new();
@@ -818,7 +707,6 @@ fn read_json(
         vdj_cells,
         gex_cells,
         gex_cells_specified,
-        cr_version,
         tig_bc,
     })
 }

From c9ea09e9f3d0910c43080993bd057597c3f5f4d2 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Fri, 1 Mar 2024 16:48:35 -0800
Subject: [PATCH 06/15] Delete the manually-implemented JSON loading function.

---
 io_utils/src/lib.rs | 79 ---------------------------------------------
 1 file changed, 79 deletions(-)

diff --git a/io_utils/src/lib.rs b/io_utils/src/lib.rs
index f15a6b669..3b393a10a 100644
--- a/io_utils/src/lib.rs
+++ b/io_utils/src/lib.rs
@@ -251,85 +251,6 @@ pub fn get_metric_value(f: impl AsRef<Path>, metric: &str) -> String {
     String::default()
 }
 
-// ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
-// CODE FOR STREAMING A JSON VECTOR
-// ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
-
-// Read an entry from a json file that represents a vector.  This is not completely
-// general as it depends on assumptions about the formatting of the file.
-//
-// To compare to and probably replace with:
-// https://martian-lang.github.io/martian-rust/doc/martian_filetypes/json_file/
-// index.html#lazy-readwrite-example
-
-pub fn read_vector_entry_from_json<R: BufRead>(json: &mut R) -> Result<Option<Vec<u8>>, String> {
-    let mut line = String::new();
-    if json.read_line(&mut line).is_err() || line == *"" || line == *"[]" {
-        return Ok(None);
-    }
-    if line == *"[\n" {
-        line.clear();
-        if json.read_line(&mut line).is_err() {
-            return Err(
-                "\nProblem reading json file, probably due to a defect in it.\n".to_string(),
-            );
-        }
-    }
-    let mut entry = Vec::<u8>::new();
-    let (mut curlies, mut bracks, mut quotes) = (0_isize, 0_isize, 0_isize);
-    let mut s = line.as_bytes();
-    loop {
-        if (s == b"]" || s == b"]\n") && curlies == 0 && bracks == 0 && quotes % 2 == 0 {
-            if !entry.is_empty() {
-                return Ok(Some(entry));
-            } else {
-                return Ok(None);
-            }
-        }
-        let mut cpos = -1_isize;
-        if s.is_empty() {
-            return Err("\nError reading json file.  It is possible that the file \
-                was truncated.\n"
-                .to_string());
-        }
-        for i in (0..s.len() - 1).rev() {
-            if s[i] == b',' {
-                cpos = i as isize;
-                break;
-            }
-            if s[i] != b' ' {
-                break;
-            }
-        }
-        let mut escaped = false;
-        for i in 0..s.len() {
-            if !escaped && s[i] == b'"' {
-                quotes += 1;
-            } else if !escaped && quotes % 2 == 0 {
-                match s[i] {
-                    b'{' => curlies += 1,
-                    b'}' => curlies -= 1,
-                    b'[' => bracks += 1,
-                    b']' => bracks -= 1,
-                    b',' => {
-                        if i as isize == cpos && curlies == 0 && bracks == 0 && quotes % 2 == 0 {
-                            return Ok(Some(entry));
-                        }
-                    }
-                    _ => {}
-                };
-            }
-            escaped = s[i] == b'\\' && !escaped;
-            entry.push(s[i]);
-        }
-        line.clear();
-        if json.read_line(&mut line).is_err() {
-            return Err("\nSomething appears to be defective in a json file.\n".to_string());
-        }
-        s = line.as_bytes();
-    }
-}
-
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓
 // READ FILE TO STRING AND PRINT FILE NAME IF IT DOESN'T EXIST
 // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓

From f81ed7c965bd2e168807c048e7b6ab227fdce001 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Mon, 4 Mar 2024 15:48:30 -0800
Subject: [PATCH 07/15] Replace lazy JSON reading with martian filetypes.

---
 Cargo.lock                    | 170 +++++++++++++++++++++++++++++++---
 enclone_args/Cargo.toml       |   1 +
 enclone_args/src/read_json.rs |  35 +++----
 3 files changed, 170 insertions(+), 36 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0aaf9b057..94554fe38 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,9 +4,9 @@ version = 3
 
 [[package]]
 name = "addr2line"
-version = "0.20.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3"
+checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
 dependencies = [
  "gimli",
 ]
@@ -64,6 +64,9 @@ name = "anyhow"
 version = "1.0.63"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a26fa4d7e3f2eebadf743988fc8aec9fa9a9e82611acafd77c1462ed6262440a"
+dependencies = [
+ "backtrace",
+]
 
 [[package]]
 name = "approx"
@@ -117,9 +120,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "backtrace"
-version = "0.3.68"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12"
+checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837"
 dependencies = [
  "addr2line",
  "cc",
@@ -354,6 +357,27 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "csv"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe"
+dependencies = [
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "debruijn"
 version = "0.3.4"
@@ -380,6 +404,12 @@ dependencies = [
  "uuid",
 ]
 
+[[package]]
+name = "deranged"
+version = "0.3.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+
 [[package]]
 name = "derive-new"
 version = "0.5.9"
@@ -462,6 +492,7 @@ dependencies = [
  "hdf5",
  "io_utils",
  "itertools",
+ "martian-filetypes",
  "rand",
  "rayon",
  "regex",
@@ -673,6 +704,15 @@ dependencies = [
  "instant",
 ]
 
+[[package]]
+name = "fern"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9f0c14694cbd524c8720dd69b0e3179344f04ebb5f90f2e4a440c6ea3b2f1ee"
+dependencies = [
+ "log",
+]
+
 [[package]]
 name = "filetime"
 version = "0.2.19"
@@ -752,9 +792,9 @@ dependencies = [
 
 [[package]]
 name = "gimli"
-version = "0.27.3"
+version = "0.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e"
+checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
 [[package]]
 name = "graph_simple"
@@ -1031,6 +1071,52 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "martian"
+version = "0.26.0"
+source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21"
+dependencies = [
+ "anyhow",
+ "backtrace",
+ "fern",
+ "heck",
+ "log",
+ "rustc_version",
+ "serde",
+ "serde_json",
+ "tempfile",
+ "time",
+]
+
+[[package]]
+name = "martian-derive"
+version = "0.26.0"
+source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21"
+dependencies = [
+ "martian",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "syn 2.0.52",
+]
+
+[[package]]
+name = "martian-filetypes"
+version = "0.27.0"
+source = "git+https://github.com/martian-lang/martian-rust?branch=master#345490b52d2722fe30b042d78dcd601225aaee21"
+dependencies = [
+ "anyhow",
+ "bincode",
+ "csv",
+ "flate2",
+ "lz4",
+ "martian",
+ "martian-derive",
+ "serde",
+ "serde_json",
+ "zstd",
+]
+
 [[package]]
 name = "matches"
 version = "0.1.9"
@@ -1202,11 +1288,20 @@ dependencies = [
  "libm",
 ]
 
+[[package]]
+name = "num_threads"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "object"
-version = "0.31.1"
+version = "0.32.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1"
+checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
 dependencies = [
  "memchr",
 ]
@@ -1385,7 +1480,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5"
 dependencies = [
  "proc-macro2",
- "syn 2.0.50",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -1479,7 +1574,7 @@ dependencies = [
  "prost 0.12.3",
  "prost-types 0.12.3",
  "regex",
- "syn 2.0.50",
+ "syn 2.0.52",
  "tempfile",
  "which",
 ]
@@ -1507,7 +1602,7 @@ dependencies = [
  "itertools",
  "proc-macro2",
  "quote",
- "syn 2.0.50",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -1663,9 +1758,18 @@ dependencies = [
 
 [[package]]
 name = "rustc-demangle"
-version = "0.1.21"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342"
+checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
+
+[[package]]
+name = "rustc_version"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
+dependencies = [
+ "semver",
+]
 
 [[package]]
 name = "rustix"
@@ -1740,6 +1844,12 @@ dependencies = [
  "untrusted",
 ]
 
+[[package]]
+name = "semver"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca"
+
 [[package]]
 name = "serde"
 version = "1.0.156"
@@ -1904,9 +2014,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.50"
+version = "2.0.52"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74f1bdc9872430ce9b75da68329d1c1746faf50ffac5f19e02b71e37ff881ffb"
+checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1981,6 +2091,36 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "time"
+version = "0.3.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a79d09ac6b08c1ab3906a2f7cc2e81a0e27c7ae89c63812df75e52bef0751e07"
+dependencies = [
+ "deranged",
+ "itoa",
+ "libc",
+ "num_threads",
+ "serde",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
+
+[[package]]
+name = "time-macros"
+version = "0.2.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75c65469ed6b3a4809d987a41eb1dc918e9bc1d92211cbad7ae82931846f7451"
+dependencies = [
+ "time-core",
+]
+
 [[package]]
 name = "tinyvec"
 version = "1.6.0"
diff --git a/enclone_args/Cargo.toml b/enclone_args/Cargo.toml
index 92a392fab..d634a1ffc 100644
--- a/enclone_args/Cargo.toml
+++ b/enclone_args/Cargo.toml
@@ -30,6 +30,7 @@ evalexpr = ">=7, <12"
 expr_tools = { path = "../expr_tools" }
 io_utils = { path = "../io_utils" }
 itertools.workspace = true
+martian-filetypes = { git = "https://github.com/martian-lang/martian-rust", branch = "master" }
 rand = "0.8"
 rayon = "1"
 regex = { version = "1", default-features = false, features = ["std", "perf"] }
diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index 745ce2f27..8daa80e9d 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -7,10 +7,13 @@ use debruijn::dna_string::DnaString;
 use enclone_core::barcode_fate::BarcodeFate;
 use enclone_core::defs::{EncloneControl, OriginInfo, TigData};
 use io_utils::{open_maybe_compressed, path_exists};
+use martian_filetypes::json_file::{Json, LazyJsonReader};
+use martian_filetypes::LazyRead;
 use rand::Rng;
 use rayon::prelude::*;
 use std::collections::HashMap;
 use std::fmt::Write;
+use std::io::BufReader;
 use string_utils::{stringme, strme, TextUtils};
 use vdj_ann::annotate::ContigAnnotation;
 use vdj_ann::{annotate, refx, transcript};
@@ -69,7 +72,7 @@ struct JsonParseResult {
     tig: Option<TigData>,
 }
 
-fn parse_vector_entry_from_json(
+fn process_json_annotation(
     ann: ContigAnnotation,
     json: &str,
     accept_inconsistent: bool,
@@ -601,21 +604,15 @@ fn read_json(
              input files to enclone, including the PRE argument.\n"
         ));
     }
-    // Read the entire file to memory before parsing.
-    // See https://github.com/serde-rs/json/issues/160
-    // The previous implementation was essentially doing this anyway, so it
-    // shouldn't drastically change our memory consumption.
-    let mut contents = String::new();
-    open_maybe_compressed(&jsonx)
-        .read_to_string(&mut contents)
-        .unwrap();
-
-    let mut results: Vec<_> = serde_json::Deserializer::from_str(&contents)
-        .into_iter::<ContigAnnotation>()
-        .enumerate()
-        .par_bridge()
-        .map(|(ann_index, entry)| {
-            parse_vector_entry_from_json(
+
+    let reader: LazyJsonReader<_, Json, _> =
+        LazyJsonReader::with_reader(BufReader::new(open_maybe_compressed(&jsonx)))
+            .map_err(|err| format!("{err:#?}"))?;
+
+    let results: Vec<_> = reader
+        .into_iter()
+        .map(|entry| {
+            process_json_annotation(
                 entry.unwrap(),
                 json,
                 accept_inconsistent,
@@ -626,18 +623,14 @@ fn read_json(
                 reannotate,
                 ctl,
             )
-            .map(|r| (ann_index, r))
         })
         .collect::<Result<Vec<_>, String>>()?;
-    // rayon's par_bridge feature doesn't preserve order, so sort the result
-    // for stable behavior.
-    results.sort_by_key(|(ann_index, _)| *ann_index);
 
     let mut tigs = Vec::new();
     let mut vdj_cells = Vec::new();
     let mut gex_cells = Vec::new();
     let mut gex_cells_specified = false;
-    for (_, result) in results {
+    for result in results {
         if let Some(tig) = result.tig {
             tigs.push(tig);
         }

From 9ab8d960bdf3a15225bd2055d222799f1d8448af Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Mon, 4 Mar 2024 16:00:06 -0800
Subject: [PATCH 08/15] Add syn to deny.toml.

---
 deny.toml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/deny.toml b/deny.toml
index 98dbe080c..a3e328104 100644
--- a/deny.toml
+++ b/deny.toml
@@ -139,3 +139,9 @@ github = ["10XGenomics"]
 gitlab = []
 # 1 or more bitbucket.org organizations to allow git sources for
 bitbucket = []
+
+
+[[bans.skip]]
+# many packages depend on syn 1
+name = "syn"
+version = "1.0.105"

From 245ca15d5d7093026ca46321116e070a81382752 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Mon, 4 Mar 2024 16:05:28 -0800
Subject: [PATCH 09/15] Allow martian-lang git repos.

---
 deny.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deny.toml b/deny.toml
index a3e328104..89e15694c 100644
--- a/deny.toml
+++ b/deny.toml
@@ -130,6 +130,7 @@ allow-registry = ["https://github.com/rust-lang/crates.io-index"]
 allow-git = [
     # TODO: remove this
     "https://github.com/Barandis/qd",
+    "https://github.com/martian-lang/martian-rust",
 ]
 
 [sources.allow-org]

From ad13f638fe5900e5e401842228e066d2527ef353 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Mon, 4 Mar 2024 16:38:49 -0800
Subject: [PATCH 10/15] Remove the not-needed manual de-escaping code.

---
 enclone_args/src/read_json.rs | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index 8daa80e9d..702177869 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -448,21 +448,10 @@ fn process_json_annotation(
         return Err(json_error(Some(json), ctl.gen_opt.internal_run, &msg));
     }
     let (tig_start, tig_stop) = (tig_start as usize, tig_stop as usize);
-    let mut quals = Vec::<u8>::new();
-    let mut slashed = false;
-    for &qual in ann.quals.as_bytes() {
-        if !slashed && qual == b'\\'
-        /* && ( i == 0 || quals0[i-1] != b'\\' ) */
-        {
-            slashed = true;
-            continue;
-        }
-        slashed = false;
-        quals.push(qual);
-    }
-    assert_eq!(ann.sequence.len(), quals.len());
+    let mut quals = ann.quals.as_bytes().to_vec();
+    assert_eq!(ann.sequence.len(), ann.quals.as_bytes().len());
     let seq = &ann.sequence[tig_start..tig_stop].to_string();
-    for qual in quals.iter_mut() {
+    for qual in &mut quals {
         *qual -= 33_u8;
     }
     let full_quals = quals;

From bdd34395d66fdd6181429df5a14e201ad540bf37 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Tue, 5 Mar 2024 15:59:44 -0800
Subject: [PATCH 11/15] Re-add missing chain_type setter.

---
 enclone_args/src/read_json.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index 702177869..1fc6f03b2 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -310,7 +310,7 @@ fn process_json_annotation(
             }
             if region_type == VdjRegion::V && ref_start == 0 {
                 let chain = a.feature.chain;
-
+                chain_type = chain.to_string();
                 tig_start = a.contig_match_start as isize;
                 cdr3_start -= tig_start as usize;
                 if chain == VdjChain::IGH

From 4a35a0c0208cbc54121951caf44f88d0ef258d4b Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Tue, 5 Mar 2024 16:12:20 -0800
Subject: [PATCH 12/15] Load annotations as empty vec if missing.

---
 vdj_ann/src/annotate.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs
index 38bd005f5..0fecda3ae 100644
--- a/vdj_ann/src/annotate.rs
+++ b/vdj_ann/src/annotate.rs
@@ -3007,9 +3007,10 @@ pub struct ContigAnnotation {
     pub fwr4: Option<Region>,
 
     // annotations
+    #[serde(default)]
     pub annotations: Vec<AnnotationUnit>, // the annotations
-    pub clonotype: Option<String>,        // null, filled in later
-    pub info: ClonotypeInfo,              // Empty initially, may be filled in later
+    pub clonotype: Option<String>, // null, filled in later
+    pub info: ClonotypeInfo,       // Empty initially, may be filled in later
 
     // state of the contig
     pub high_confidence: bool,               // declared high confidence?

From a5cdba03f058aebedf6e9796befdd4bf14f0b248 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Tue, 5 Mar 2024 16:22:24 -0800
Subject: [PATCH 13/15] Load info as default if missing.

---
 vdj_ann/src/annotate.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs
index 0fecda3ae..527f32914 100644
--- a/vdj_ann/src/annotate.rs
+++ b/vdj_ann/src/annotate.rs
@@ -3010,7 +3010,8 @@ pub struct ContigAnnotation {
     #[serde(default)]
     pub annotations: Vec<AnnotationUnit>, // the annotations
     pub clonotype: Option<String>, // null, filled in later
-    pub info: ClonotypeInfo,       // Empty initially, may be filled in later
+    #[serde(default)]
+    pub info: ClonotypeInfo, // Empty initially, may be filled in later
 
     // state of the contig
     pub high_confidence: bool,               // declared high confidence?

From dcf69f9727a3776a6936ff16bed8f370ddc51d9e Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Tue, 5 Mar 2024 16:26:52 -0800
Subject: [PATCH 14/15] Load filtered as default if missing.

---
 vdj_ann/src/annotate.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs
index 527f32914..adc59d263 100644
--- a/vdj_ann/src/annotate.rs
+++ b/vdj_ann/src/annotate.rs
@@ -3020,7 +3020,8 @@ pub struct ContigAnnotation {
     pub invalidated_umis: Option<Vec<String>>, // invalidated UMIs
     pub is_cell: bool,                       // was the barcode declared a cell?
     pub productive: Option<bool>,            // productive?  (null means not full length)
-    pub filtered: bool,                      // true and never changed (unused field)
+    #[serde(default = "set_true")]
+    pub filtered: bool, // true and never changed (unused field)
 
     pub is_gex_cell: Option<bool>, // Was the barcode declared a cell by Gene expression data, if available
     pub is_asm_cell: Option<bool>, // Was the barcode declared a cell by the VDJ assembler
@@ -3032,6 +3033,10 @@ pub struct ContigAnnotation {
     pub sample: Option<String>,
 }
 
+fn set_true() -> bool {
+    true
+}
+
 impl ContigAnnotation {
     // Given the alignment entities produced by annotate_seq, produce a
     // ContigAnnotation.  This is done so as to produce at most one V, D, J and C,

From 644438be90174e76c9bda6d68f36f47bd9109511 Mon Sep 17 00:00:00 2001
From: Chris Macklin <chris.macklin@10xgenomics.com>
Date: Wed, 6 Mar 2024 12:15:25 -0800
Subject: [PATCH 15/15] Refactor JSON loading to avoid collecting into an
 intermediate vector.

---
 enclone_args/src/read_json.rs | 45 ++++++++++++++++-------------------
 enclone_stuff/src/start.rs    |  2 +-
 2 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs
index 1fc6f03b2..c08ebec52 100644
--- a/enclone_args/src/read_json.rs
+++ b/enclone_args/src/read_json.rs
@@ -27,7 +27,7 @@ fn json_error(json: Option<&str>, internal_run: bool, msg: &str) -> String {
         "There is something wrong with the contig annotations in the cellranger output file"
             .to_string();
     if let Some(json) = json {
-        write!(msgx, "\n{}.", json).unwrap();
+        write!(msgx, "\n{json}.").unwrap();
     } else {
         msgx += ".";
     }
@@ -594,32 +594,27 @@ fn read_json(
         ));
     }
 
-    let reader: LazyJsonReader<_, Json, _> =
-        LazyJsonReader::with_reader(BufReader::new(open_maybe_compressed(&jsonx)))
-            .map_err(|err| format!("{err:#?}"))?;
-
-    let results: Vec<_> = reader
-        .into_iter()
-        .map(|entry| {
-            process_json_annotation(
-                entry.unwrap(),
-                json,
-                accept_inconsistent,
-                origin_info,
-                dataset_index,
-                refdata,
-                to_ref_index,
-                reannotate,
-                ctl,
-            )
-        })
-        .collect::<Result<Vec<_>, String>>()?;
-
     let mut tigs = Vec::new();
     let mut vdj_cells = Vec::new();
     let mut gex_cells = Vec::new();
     let mut gex_cells_specified = false;
-    for result in results {
+
+    let reader: LazyJsonReader<ContigAnnotation, Json, _> =
+        LazyJsonReader::with_reader(BufReader::new(open_maybe_compressed(&jsonx)))
+            .map_err(|err| format!("{err:#?}"))?;
+
+    for entry in reader.into_iter() {
+        let result = process_json_annotation(
+            entry.map_err(|err| err.to_string())?,
+            json,
+            accept_inconsistent,
+            origin_info,
+            dataset_index,
+            refdata,
+            to_ref_index,
+            reannotate,
+            ctl,
+        )?;
         if let Some(tig) = result.tig {
             tigs.push(tig);
         }
@@ -720,8 +715,8 @@ pub fn parse_json_annotations_files(
         .par_iter()
         .enumerate()
         .map(|(li, dataset_path)| {
-            let json = format!("{}/{ann}", dataset_path);
-            let json_lz4 = format!("{}/{ann}.lz4", dataset_path);
+            let json = format!("{dataset_path}/{ann}");
+            let json_lz4 = format!("{dataset_path}/{ann}.lz4");
             if !path_exists(&json) && !path_exists(&json_lz4) {
                 return Err(format!("\ncan't find {json} or {json_lz4}\n"));
             }
diff --git a/enclone_stuff/src/start.rs b/enclone_stuff/src/start.rs
index 62c4784c0..52ae52f93 100644
--- a/enclone_stuff/src/start.rs
+++ b/enclone_stuff/src/start.rs
@@ -20,7 +20,7 @@ use enclone::misc2::{check_for_barcode_reuse, find_exact_subclonotypes, search_f
 use enclone::misc3::sort_tig_bc;
 use enclone_args::read_json::{parse_json_annotations_files, Annotations};
 use enclone_core::barcode_fate::BarcodeFate;
-use enclone_core::defs::{AlleleData, CloneInfo, TigData};
+use enclone_core::defs::{AlleleData, CloneInfo};
 use enclone_core::enclone_structs::{EncloneExacts, EncloneIntermediates, EncloneSetup};
 use enclone_core::hcomp::heavy_complexity;
 use enclone_print::define_mat::{define_mat, setup_define_mat};