Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

further bin format cleanups #386

Merged
merged 4 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 16 additions & 34 deletions enclone_args/src/load_gex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,6 @@ use vector_utils::{bin_position, unique_sort};
pub fn get_gex_info(ctl: &mut EncloneControl) -> Result<GexInfo, String> {
let mut gex_features = Vec::<Vec<String>>::new();
let mut gex_barcodes = Vec::<Vec<String>>::new();
let mut fb_total_umis = Vec::<u64>::new();
let mut fb_total_reads = Vec::<u64>::new();
let mut fb_brn = Vec::<Vec<(String, u32, u32)>>::new();
let mut fb_brnr = Vec::<Vec<(String, u32, u32)>>::new();
let mut fb_bdcs = Vec::<Vec<(String, u32, u32, u32)>>::new();
let mut feature_refs = Vec::<String>::new();
let mut cluster = Vec::<HashMap<String, usize>>::new();
let mut cell_type = Vec::<HashMap<String, String>>::new();
Expand All @@ -42,11 +37,6 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result<GexInfo, String> {
ctl,
&mut gex_features,
&mut gex_barcodes,
&mut fb_total_umis,
&mut fb_total_reads,
&mut fb_brn,
&mut fb_brnr,
&mut fb_bdcs,
&mut feature_refs,
&mut cluster,
&mut cell_type,
Expand Down Expand Up @@ -86,29 +76,26 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result<GexInfo, String> {
let mut h5_data = Vec::<Option<Dataset>>::new();
let mut h5_indices = Vec::<Option<Dataset>>::new();
let mut h5_indptr = Vec::<Vec<u32>>::new();
if ctl.gen_opt.h5 {
let gex_outs = &ctl.origin_info.gex_path;
Comment on lines -89 to -90
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could there be an inter-dependency between these two? Perhaps there is some validation on the enclone side ensuring that ctl.origin_info.gex_path is present and has length ctl.origin_info.dataset_path.len() whenever ctl.gen_opt.h5 is true. I'm wondering whether we could get an index-out-of-bounds panic on gex_outs[i].

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I deleted this because, after my refactoring, there is no condition under which gen_opt.h5 is ever false. That flag was set by the NH5 argument, which was only used to tell enclone to read the (now deleted) binary count matrix instead of the .h5 version.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like they are populated together:

ctl.origin_info.dataset_path.push(result.0);
ctl.origin_info.gex_path.push(result.1);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Throughout this refactoring I've been careful to confirm that a conditional is constrained to exactly one path before removing it.

for i in 0..ctl.origin_info.dataset_path.len() {
// let bin_file = format!("{}/feature_barcode_matrix.bin", gex_outs[i]);
if !gex_outs[i].is_empty()
/* && !(path_exists(&bin_file) && !ctl.gen_opt.force_h5) */
{
let f = &h5_paths[i];

let h = hdf5::File::open(f).unwrap();
let gex_outs = &ctl.origin_info.gex_path;
for i in 0..ctl.origin_info.dataset_path.len() {
if !gex_outs[i].is_empty() {
let f = &h5_paths[i];

h5_data.push(Some(h.dataset("matrix/data").unwrap()));
h5_indices.push(Some(h.dataset("matrix/indices").unwrap()));
let indptr = h.dataset("matrix/indptr").unwrap();
let x: Vec<u32> = indptr.as_reader().read().unwrap().to_vec();
h5_indptr.push(x);
} else {
h5_data.push(None);
h5_indices.push(None);
h5_indptr.push(Vec::<u32>::new());
}
let h = hdf5::File::open(f).unwrap();

h5_data.push(Some(h.dataset("matrix/data").unwrap()));
h5_indices.push(Some(h.dataset("matrix/indices").unwrap()));
let indptr = h.dataset("matrix/indptr").unwrap();
let x: Vec<u32> = indptr.as_reader().read().unwrap().to_vec();
h5_indptr.push(x);
} else {
h5_data.push(None);
h5_indices.push(None);
h5_indptr.push(Vec::<u32>::new());
}
}

fn compute_feature_id(gex_features: &[String]) -> HashMap<String, usize> {
let mut x = HashMap::<String, usize>::new();
for (j, f) in gex_features.iter().enumerate() {
Expand Down Expand Up @@ -152,11 +139,6 @@ pub fn get_gex_info(ctl: &mut EncloneControl) -> Result<GexInfo, String> {
Ok(GexInfo {
gex_features,
gex_barcodes,
fb_total_umis,
fb_total_reads,
fb_brn,
fb_brnr,
fb_bdcs,
feature_refs,
cluster,
cell_type,
Expand Down
98 changes: 1 addition & 97 deletions enclone_args/src/load_gex_core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,7 @@ use io_utils::{dir_list, open_for_read, open_userfile_for_read, path_exists};
use itertools::Itertools;
use rayon::prelude::*;
use serde_json::Value;
use std::{
collections::HashMap,
convert::TryInto,
fmt::Write,
fs::read_to_string,
io::{BufRead, Read},
time::Instant,
};
use std::{collections::HashMap, fmt::Write, fs::read_to_string, io::BufRead, time::Instant};
use string_utils::{parse_csv, TextUtils};
use vector_utils::{unique_sort, VecUtils};

Expand All @@ -41,23 +34,13 @@ struct LoadResult {
feature_metrics: HashMap<(String, String), String>,
json_metrics: HashMap<String, f64>,
metrics: String,
fb_total_umis: u64,
fb_brn: Vec<(String, u32, u32)>,
feature_refs: String,
fb_brnr: Vec<(String, u32, u32)>,
fb_total_reads: u64,
fb_bdcs: Vec<(String, u32, u32, u32)>,
}

pub fn load_gex(
ctl: &mut EncloneControl,
gex_features: &mut Vec<Vec<String>>,
gex_barcodes: &mut Vec<Vec<String>>,
fb_total_umis: &mut Vec<u64>,
fb_total_reads: &mut Vec<u64>,
fb_brn: &mut Vec<Vec<(String, u32, u32)>>,
fb_brnr: &mut Vec<Vec<(String, u32, u32)>>,
fb_bdcs: &mut Vec<Vec<(String, u32, u32, u32)>>,
feature_refs: &mut Vec<String>,
cluster: &mut Vec<HashMap<String, usize>>,
cell_type: &mut Vec<HashMap<String, String>>,
Expand Down Expand Up @@ -548,80 +531,6 @@ pub fn load_gex(
r.gex_mult = gene_mult;
r.fb_mult = fb_mult;

// Read the total UMIs.

let top_file = fnx(&outs, "feature_barcode_matrix_top.total");
if path_exists(&top_file) {
pathlist.push(top_file.clone());
let mut f = open_for_read![&top_file];
let mut bytes = Vec::<u8>::new();
f.read_to_end(&mut bytes).unwrap();
r.fb_total_umis = u64::from_ne_bytes(bytes.try_into().unwrap());
}

// Read the total reads.

let top_file = fnx(&outs, "feature_barcode_matrix_top.total_reads");
if path_exists(&top_file) {
pathlist.push(top_file.clone());
let mut f = open_for_read![&top_file];
let mut bytes = Vec::<u8>::new();
f.read_to_end(&mut bytes).unwrap();
r.fb_total_reads = u64::from_ne_bytes(bytes.try_into().unwrap());
}

// Read the barcode-ref-nonref UMI count file.

let brn_file = fnx(&outs, "feature_barcode_matrix_top.brn");
if path_exists(&brn_file) {
pathlist.push(brn_file.clone());
let f = open_for_read![&brn_file];
for line in f.lines() {
let s = line.unwrap();
let fields = parse_csv(&s);
r.fb_brn.push((
fields[0].to_string(),
fields[1].parse::<u32>().unwrap(),
fields[2].parse::<u32>().unwrap(),
));
}
}

// Read the barcode-ref-nonref read count file.

let brnr_file = fnx(&outs, "feature_barcode_matrix_top.brnr");
if path_exists(&brnr_file) {
pathlist.push(brnr_file.clone());
let f = open_for_read![&brnr_file];
for line in f.lines() {
let s = line.unwrap();
let fields = parse_csv(&s);
r.fb_brnr.push((
fields[0].to_string(),
fields[1].parse::<u32>().unwrap(),
fields[2].parse::<u32>().unwrap(),
));
}
}

// Read the bdcs read count file.

let bdcs_file = fnx(&outs, "feature_barcode_matrix_top.bdcs");
if path_exists(&bdcs_file) {
pathlist.push(bdcs_file.clone());
let f = open_for_read![&bdcs_file];
for line in f.lines() {
let s = line.unwrap();
let fields = parse_csv(&s);
r.fb_bdcs.push((
fields[0].to_string(),
fields[1].parse::<u32>().unwrap(),
fields[2].parse::<u32>().unwrap(),
fields[3].parse::<u32>().unwrap(),
));
}
}

// Read the feature reference file.

let fref_file = fnx(&outs, "feature_reference.csv");
Expand Down Expand Up @@ -717,12 +626,7 @@ pub fn load_gex(
feature_metrics.push(r.feature_metrics);
json_metrics.push(r.json_metrics);
metrics.push(r.metrics);
fb_total_umis.push(r.fb_total_umis);
fb_brn.push(r.fb_brn);
feature_refs.push(r.feature_refs);
fb_brnr.push(r.fb_brnr);
fb_total_reads.push(r.fb_total_reads);
fb_bdcs.push(r.fb_bdcs);
}

// Done.
Expand Down
6 changes: 0 additions & 6 deletions enclone_core/src/defs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ pub struct GeneralOpt {
pub fate_file: String,
// Optional path to a json file containing metadata
pub proto_metadata: Option<String>,
pub h5: bool,
pub h5_pre: bool,
pub accept_reuse: bool,
pub descrip: bool,
Expand Down Expand Up @@ -823,11 +822,6 @@ pub struct CloneInfo {
pub struct GexInfo {
pub gex_features: Vec<Vec<String>>,
pub gex_barcodes: Vec<Vec<String>>,
pub fb_total_umis: Vec<u64>,
pub fb_total_reads: Vec<u64>,
pub fb_brn: Vec<Vec<(String, u32, u32)>>,
pub fb_brnr: Vec<Vec<(String, u32, u32)>>,
pub fb_bdcs: Vec<Vec<(String, u32, u32, u32)>>,
pub feature_refs: Vec<String>,
pub gex_cell_barcodes: Vec<Vec<String>>,
pub cluster: Vec<HashMap<String, usize>>,
Expand Down
1 change: 0 additions & 1 deletion enclone_core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ pub mod hcomp;
pub mod join_one;
pub mod linear_condition;
pub mod logging;
pub mod main_testlist;
pub mod mammalian_fixed_len;
pub mod median;
pub mod opt_d;
Expand Down
Loading
Loading