diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index 420b93b37..cd7bd2bdf 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -142,8 +142,7 @@ fn process_json_annotation( // Reannotate. if reannotate || ctl.gen_opt.reprod { let x = DnaString::from_dna_string(&ann.sequence); - let mut ann1 = Vec::::new(); - annotate_seq(&x, refdata, &mut ann1, true, false, true); + let mut ann1 = annotate_seq(&x, refdata, true, false, true); // If there are multiple V segment alignments, possibly reduce to just one. diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index b7cfa278b..e0d47669a 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -140,8 +140,8 @@ pub fn chain_type(b: &DnaString, rkmers_plus_full_20: &[(Kmer20, i32, i32)], rty // The structure of the output is: // { ( start on sequence, match length, ref tig, start on ref tig, mismatches on sequence ) }. -#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)] /// Annotation object denotes matches of a DnaString sequence to a reference sequence +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)] pub struct Annotation { /// Start position on sequence (contig) pub tig_start: i32, @@ -155,13 +155,25 @@ pub struct Annotation { pub mismatches: i32, } -#[derive(Ord, PartialOrd, PartialEq, Eq)] -/// Used prior to creatrion of Annotation object, includes a vector of mismatches -struct PreAnnotation { +impl From for Annotation { + fn from(a: Alignment) -> Self { + Self { + tig_start: a.tig_start, + match_len: a.len, + ref_id: a.ref_id, + ref_start: a.ref_start, + mismatches: a.mismatches.len() as i32, + } + } +} + +/// Represents an alignment between the a reference and the contig. +#[derive(PartialEq, Eq)] +struct Alignment { /// Start position on sequence (contig) tig_start: i32, /// Match length - match_len: i32, + len: i32, /// Reference id (that this contig matches) ref_id: i32, /// Start position on reference @@ -170,31 +182,57 @@ struct PreAnnotation { mismatches: Vec, } +impl Alignment { + /// Get the offset between the position on the contig and the position on the ref. + fn offset(&self) -> i32 { + self.ref_start - self.tig_start + } + + /// Sort alignments by (tig_start, len, ref_id, ref_start, mismatches). + fn cmp_by_tig_start_len(a: &Self, b: &Self) -> std::cmp::Ordering { + let key: for<'a> fn(&'a Self) -> (_, _, _, _, &'a Vec<_>) = + |x: &Self| (x.tig_start, x.len, x.ref_id, x.ref_start, &x.mismatches); + key(a).cmp(&key(b)) + } + + /// Sort alignments by (ref_id, offset, tig_start, len, mismatches). + fn cmp_by_ref_id_offset(a: &Self, b: &Self) -> std::cmp::Ordering { + let key: for<'a> fn(&'a Self) -> (_, _, _, _, &'a Vec<_>) = + |x: &Self| (x.ref_id, x.offset(), x.tig_start, x.len, &x.mismatches); + key(a).cmp(&key(b)) + } + + /// Sort alignments by (ref_id, ref_start, tig_start, len, mismatches). + fn cmp_by_ref_id_ref_start(a: &Self, b: &Self) -> std::cmp::Ordering { + let key: for<'a> fn(&'a Self) -> (_, _, _, _, &'a Vec<_>) = + |x: &Self| (x.ref_id, x.ref_start, x.tig_start, x.len, &x.mismatches); + key(a).cmp(&key(b)) + } +} + pub fn annotate_seq( b: &DnaString, refdata: &RefData, - ann: &mut Vec, allow_weak: bool, allow_improper: bool, abut: bool, -) { +) -> Vec { let mut log = Vec::::new(); annotate_seq_core( b, refdata, - ann, allow_weak, allow_improper, abut, &mut log, false, - ); + ) } -fn print_alignx(log: &mut Vec, a: &PreAnnotation, refdata: &RefData) { +fn print_alignx(log: &mut Vec, a: &Alignment, refdata: &RefData) { let t = a.ref_id as usize; let l = a.tig_start; - let len = a.match_len; + let len = a.len; let p = a.ref_start; let mis = a.mismatches.len(); fwriteln!( @@ -212,7 +250,7 @@ fn print_alignx(log: &mut Vec, a: &PreAnnotation, refdata: &RefData) { fn report_semis( verbose: bool, title: &str, - semi: &[SemiPerfectMatch], + semi: &[Alignment], b_seq: &[u8], refs: &[DnaString], log: &mut Vec, @@ -224,27 +262,23 @@ fn report_semis( log, "t = {}, offset = {}, tig start = {}, ref start = {}, len = {}, mis = {}", s.ref_id, - s.offset, + s.offset(), s.tig_start, - s.offset + s.tig_start, + s.ref_start, s.len, s.mismatches.len(), ); - let t = s.ref_id as usize; - let off = s.offset; - let tig_start = s.tig_start; - let ref_start = off + tig_start; - let len = s.len; - let mis = &s.mismatches; let mut new_mis = Vec::::new(); - for j in 0..len { - if b_seq[(tig_start + j) as usize] != refs[t].get((ref_start + j) as usize) { - new_mis.push(tig_start + j); + for j in 0..s.len { + if b_seq[(s.tig_start + j) as usize] + != refs[s.ref_id as usize].get((s.ref_start + j) as usize) + { + new_mis.push(s.tig_start + j); } } - if new_mis != *mis { + if new_mis != s.mismatches { fwriteln!(log, " [INVALID]"); - fwriteln!(log, "computed = {}", mis.iter().format(",")); + fwriteln!(log, "computed = {}", s.mismatches.iter().format(",")); fwriteln!(log, "correct = {}", new_mis.iter().format(",")); } else { fwriteln!(log, ""); @@ -253,46 +287,6 @@ fn report_semis( } } -// TODO: combine this and the PreAnnotation type above? -// This is tricky, though, as both types derive sorting, and the order of fields -// implies different sort ordering. Probably the best way to address this is to -// remove the derivation of eq/ord, provide key-generating methods with -// explicit names, and use those. This forces the caller to be explicit about -// which sort order they're using. -#[derive(PartialEq, Eq, PartialOrd, Ord)] -struct PerfectMatch { - pub ref_id: i32, - /// ref_start - tig_start - pub offset: i32, - pub tig_start: i32, - // len - pub len: i32, -} - -// TODO: combine this and the PreAnnotation type above? -// This is tricky, though, as both types derive sorting, and the order of fields -// implies different sort ordering. Probably the best way to address this is to -// remove the derivation of eq/ord, provide key-generating methods with -// explicit names, and use those. This forces the caller to be explicit about -// which sort order they're using. -#[derive(PartialEq, Eq, PartialOrd, Ord)] -struct SemiPerfectMatch { - pub ref_id: i32, - /// off = pos on ref - pos on b - pub offset: i32, - /// pos on b - pub tig_start: i32, - pub len: i32, - /// positions on b of mismatches - pub mismatches: Vec, -} - -#[derive(PartialEq, Eq, PartialOrd, Ord)] -struct Offset { - pub ref_id: i32, - pub offset: i32, -} - /// Minimum length of sequence we'll try to annotate. const K: usize = 12; @@ -303,13 +297,12 @@ const MIN_PERF_EXT: usize = 5; pub fn annotate_seq_core( b: &DnaString, refdata: &RefData, - ann: &mut Vec, allow_weak: bool, allow_improper: bool, abut: bool, log: &mut Vec, verbose: bool, -) { +) -> Vec { // The DNA string representation is inefficient because it stores bases as packed k-mers // which requires a lot of array bounds checks when unpacking which was a hot path // we found when profiling the CI job. To avoid those in the inner @@ -317,7 +310,7 @@ pub fn annotate_seq_core( let b_seq = b.to_bytes(); if b.len() < K { - return; + return Vec::new(); } let mut perf = find_perfect_matches_initial(b, refdata, &b_seq, allow_weak); if verbose { @@ -327,9 +320,9 @@ pub fn annotate_seq_core( log, "t = {}, offset = {}, tig start = {}, ref start = {}, len = {}", s.ref_id, - s.offset, + s.offset(), s.tig_start, - s.offset + s.tig_start, + s.ref_start, s.len, ); } @@ -338,7 +331,7 @@ pub fn annotate_seq_core( perf.extend(new_matches); // Sort perfect matches. - perf.sort_unstable(); + perf.sort_unstable_by(Alignment::cmp_by_ref_id_offset); if verbose { fwriteln!(log, "\nPERF ALIGNMENTS\n"); for s in &perf { @@ -346,9 +339,9 @@ pub fn annotate_seq_core( log, "t = {}, offset = {}, tig start = {}, ref start = {}, len = {}", s.ref_id, - s.offset, + s.offset(), s.tig_start, - s.offset + s.tig_start, + s.ref_start, s.len, ); } @@ -420,17 +413,9 @@ pub fn annotate_seq_core( log, ); - let mut annx: Vec<_> = semi - .into_iter() - .map(|x| PreAnnotation { - tig_start: x.tig_start, - match_len: x.len, - ref_id: x.ref_id, - ref_start: x.tig_start + x.offset, - mismatches: x.mismatches, - }) - .collect(); - unique_sort(&mut annx); + let mut annx = semi; + annx.sort_by(Alignment::cmp_by_tig_start_len); + annx.dedup(); if !allow_improper { delete_improper_matches(&mut annx); @@ -522,18 +507,7 @@ pub fn annotate_seq_core( remove_subsumed_extended_alignments(&refdata.rheaders, &mut annx); - // Transform. - - ann.clear(); - for x in &annx { - ann.push(Annotation { - tig_start: x.tig_start, - match_len: x.match_len, - ref_id: x.ref_id, - ref_start: x.ref_start, - mismatches: x.mismatches.len() as i32, - }); - } + annx.into_iter().map(Into::into).collect() } /// Find maximal perfect matches of length >= 20, or 12 for J regions, so long @@ -543,8 +517,8 @@ fn find_perfect_matches_initial( refdata: &RefData, b_seq: &[u8], allow_weak: bool, -) -> Vec { - let mut perf = Vec::::new(); +) -> Vec { + let mut perf = Vec::::new(); for l in 0..(b.len() - K + 1) { let x: Kmer12 = b.get_kmer(l); let low = lower_bound1_3(&refdata.rkmers_plus, &x) as usize; @@ -593,11 +567,12 @@ fn find_perfect_matches_initial( } } if ok { - perf.push(PerfectMatch { + perf.push(Alignment { ref_id: t as i32, - offset: p as i32 - l as i32, + ref_start: p as i32, tig_start: l as i32, len: len as i32, + mismatches: Vec::new(), }); } } @@ -611,13 +586,19 @@ fn find_perfect_matches_initial( fn find_additional_perfect_matches( b_seq: &[u8], refs: &[DnaString], - perf: &[PerfectMatch], -) -> Vec { + perf: &[Alignment], +) -> Vec { + #[derive(PartialEq, Eq, PartialOrd, Ord)] + struct Offset { + pub ref_id: i32, + pub offset: i32, + } + let mut offsets = Vec::::new(); for p in perf { offsets.push(Offset { ref_id: p.ref_id, - offset: p.offset, + offset: p.offset(), }); } unique_sort(&mut offsets); @@ -632,7 +613,7 @@ fn find_additional_perfect_matches( let mut tig_starts = Vec::::new(); let mut total = 0; for pi in perf { - if pi.ref_id == t && pi.offset == off { + if pi.ref_id == t && pi.offset() == off { tig_starts.push(pi.tig_start); total += pi.len; } @@ -671,11 +652,12 @@ fn find_additional_perfect_matches( } } if !known { - new_matches.push(PerfectMatch { + new_matches.push(Alignment { ref_id: t, - offset: p - l, + ref_start: p, tig_start: l, len, + mismatches: Vec::new(), }); } } @@ -688,18 +670,15 @@ fn find_additional_perfect_matches( } /// Merge perfect matches. We track the positions on b of mismatches. -fn merge_perfect_matches( - b_seq: &[u8], - refs: &[DnaString], - perf: Vec, -) -> Vec { - let mut semi = Vec::::new(); +fn merge_perfect_matches(b_seq: &[u8], refs: &[DnaString], perf: Vec) -> Vec { + let mut semi = Vec::::new(); let mut i = 0; while i < perf.len() { + assert!(perf[i].mismatches.is_empty()); let j = next_diff_any(&perf, i, |a, b| { - a.ref_id == b.ref_id && a.offset == b.offset + a.ref_id == b.ref_id && a.offset() == b.offset() }); - let (t, off) = (perf[i].ref_id, perf[i].offset); + let (t, off) = (perf[i].ref_id, perf[i].offset()); let mut join = vec![false; j - i]; let mut mis = vec![Vec::::new(); j - i]; for k in i..j - 1 { @@ -733,9 +712,9 @@ fn merge_perfect_matches( m.append(&mut mis[k2 - i].clone()); k2 += 1; } - semi.push(SemiPerfectMatch { + semi.push(Alignment { ref_id: t, - offset: off, + ref_start: perf[k1].ref_start, tig_start: perf[k1].tig_start, len: perf[k2].tig_start + perf[k2].len - perf[k1].tig_start, mismatches: m, @@ -748,10 +727,10 @@ fn merge_perfect_matches( } /// Extend matches backwards and then forwards. -fn extend_matches(b_seq: &[u8], refs: &[DnaString], semi: &mut [SemiPerfectMatch]) { +fn extend_matches(b_seq: &[u8], refs: &[DnaString], semi: &mut [Alignment]) { for s in &mut *semi { let t = s.ref_id; - let off = s.offset; + let off = s.offset(); let mut l = s.tig_start; let mut len = s.len; let mut mis = s.mismatches.clone(); @@ -801,6 +780,7 @@ fn extend_matches(b_seq: &[u8], refs: &[DnaString], semi: &mut [SemiPerfectMatch len += 1; } } + s.ref_start = l + off; s.tig_start = l; s.len = len; s.mismatches = mis; @@ -822,18 +802,14 @@ fn extend_matches(b_seq: &[u8], refs: &[DnaString], semi: &mut [SemiPerfectMatch /// we're not sure if it arose from supported mouse strains or if the V segment might have /// been corrupted during the growth of the cell line. The A20 heavy chain V differs by 20% /// from the reference. -fn annotate_40mers_for_mouse_a20( - b_seq: &[u8], - refs: &[DnaString], - semi: &mut Vec, -) { +fn annotate_40mers_for_mouse_a20(b_seq: &[u8], refs: &[DnaString], semi: &mut Vec) { let mut i = 0; while i < semi.len() { let mut j = i + 1; let t = semi[i].ref_id; - let off = semi[i].offset; + let off = semi[i].offset(); while j < semi.len() { - if semi[j].ref_id != t || semi[j].offset != off { + if semi[j].ref_id != t || semi[j].offset() != off { break; } j += 1; @@ -861,9 +837,9 @@ fn annotate_40mers_for_mouse_a20( x.push(l + m); } } - semi.push(SemiPerfectMatch { + semi.push(Alignment { ref_id: t, - offset: off, + ref_start: p, tig_start: p - off, len: L, mismatches: x, @@ -874,21 +850,17 @@ fn annotate_40mers_for_mouse_a20( } i = j; } - semi.sort(); + semi.sort_by(Alignment::cmp_by_ref_id_offset); } /// Allow extension over some mismatches on right if it gets us to the end on /// the reference. Ditto for left. /// ◼ Not documented in main function docstring. -fn extend_matches_to_end_of_reference( - b_seq: &[u8], - refs: &[DnaString], - semi: &mut [SemiPerfectMatch], -) { +fn extend_matches_to_end_of_reference(b_seq: &[u8], refs: &[DnaString], semi: &mut [Alignment]) { let max_mis = 5; for s in &mut *semi { let t = s.ref_id; - let off = s.offset; + let off = s.offset(); let l = s.tig_start; let mut len = s.len; let mut mis = s.mismatches.clone(); @@ -907,7 +879,7 @@ fn extend_matches_to_end_of_reference( } for s in &mut *semi { let t = s.ref_id; - let off = s.offset; + let off = s.offset(); let mut l = s.tig_start; let mut len = s.len; let mut mis = s.mismatches.clone(); @@ -921,6 +893,7 @@ fn extend_matches_to_end_of_reference( len += 1; } if mis_count <= max_mis && l + off == 0 { + s.ref_start = l + off; s.tig_start = l; s.len = len; s.mismatches = mis; @@ -935,19 +908,19 @@ fn extend_matches_to_end_of_reference( /// /// This is pretty crappy. What we should do instead is arrange the initial /// extension between match blocks so it can be iterated. -fn extend_between_match_blocks(b_seq: &[u8], refs: &[DnaString], semi: &mut Vec) { +fn extend_between_match_blocks(b_seq: &[u8], refs: &[DnaString], semi: &mut Vec) { let mut to_delete = vec![false; semi.len()]; for i1 in 0..semi.len() { let t1 = semi[i1].ref_id; if t1 < 0 { continue; } - let off1 = semi[i1].offset; + let off1 = semi[i1].offset(); let (l1, len1) = (semi[i1].tig_start, semi[i1].len); let mis1 = semi[i1].mismatches.clone(); for i2 in 0..semi.len() { let t2 = semi[i2].ref_id; - let off2 = semi[i2].offset; + let off2 = semi[i2].offset(); if t2 != t1 || off2 != off1 { continue; } @@ -978,13 +951,13 @@ fn extend_between_match_blocks(b_seq: &[u8], refs: &[DnaString], semi: &mut Vec< } /// Merge overlapping alignments. -fn merge_overlapping_alignments(semi: &mut Vec) { +fn merge_overlapping_alignments(semi: &mut Vec) { let mut to_delete = vec![false; semi.len()]; let mut i = 0; while i < semi.len() { let mut j = i + 1; while j < semi.len() { - if semi[j].ref_id != semi[i].ref_id || semi[j].offset != semi[i].offset { + if semi[j].ref_id != semi[i].ref_id || semi[j].offset() != semi[i].offset() { break; } j += 1; @@ -994,6 +967,7 @@ fn merge_overlapping_alignments(semi: &mut Vec) { if to_delete[k1] || to_delete[k2] { continue; } + let offset = semi[k1].offset(); let start1 = semi[k1].tig_start; let start2 = semi[k2].tig_start; let len1 = semi[k1].len; @@ -1003,6 +977,7 @@ fn merge_overlapping_alignments(semi: &mut Vec) { let start = min(start1, start2); let stop = max(stop1, stop2); if stop - start <= len1 + len2 { + semi[k1].ref_start = start + offset; semi[k1].tig_start = start; semi[k1].len = stop - start; let mut m2 = semi[k2].mismatches.clone(); @@ -1020,7 +995,7 @@ fn merge_overlapping_alignments(semi: &mut Vec) { /// If a V gene aligns starting at 0, and goes at least 60% of the way to the end, and there /// is only one alignment of the V gene, extend it to the end. /// (Only one requirement ameliorated.) -fn extend_long_v_gene_alignments(b_seq: &[u8], refdata: &RefData, semi: &mut [SemiPerfectMatch]) { +fn extend_long_v_gene_alignments(b_seq: &[u8], refdata: &RefData, semi: &mut [Alignment]) { let mut i = 0; while i < semi.len() { let mut j = i + 1; @@ -1041,8 +1016,8 @@ fn extend_long_v_gene_alignments(b_seq: &[u8], refdata: &RefData, semi: &mut [Se } } if ok { - let offset = semi[k].offset; - let ref_start = semi[k].offset + semi[k].tig_start; + let offset = semi[k].offset(); + let ref_start = semi[k].offset() + semi[k].tig_start; let tig_start = semi[k].tig_start; let t = semi[k].ref_id as usize; if !refdata.rheaders[t].contains("segment") && refdata.is_v(t) { @@ -1073,21 +1048,21 @@ fn extend_long_v_gene_alignments(b_seq: &[u8], refdata: &RefData, semi: &mut [Se /// Delete some subsumed matches. // FIXME: collapse this and remove_subsumed_alignments below once the match // types are collapsed. They have slightly different behavior. -fn remove_subsumed_matches(semi: &mut Vec) { +fn remove_subsumed_matches(semi: &mut Vec) { let mut to_delete = vec![false; semi.len()]; let mut i = 0; while i < semi.len() { let mut j = i + 1; while j < semi.len() { - if semi[j].ref_id != semi[i].ref_id || semi[j].offset != semi[i].offset { + if semi[j].ref_id != semi[i].ref_id || semi[j].offset() != semi[i].offset() { break; } j += 1; } for k1 in i..j { for k2 in i..j { - if semi[k1].offset + semi[k1].tig_start + semi[k1].len - == semi[k2].offset + semi[k2].tig_start + semi[k2].len + if semi[k1].offset() + semi[k1].tig_start + semi[k1].len + == semi[k2].offset() + semi[k2].tig_start + semi[k2].len && semi[k1].len > semi[k2].len { to_delete[k2] = true; @@ -1100,21 +1075,10 @@ fn remove_subsumed_matches(semi: &mut Vec) { } /// Delete matches that are 'too improper'. -fn delete_improper_matches(annx: &mut Vec) { +fn delete_improper_matches(annx: &mut Vec) { let mut to_delete: Vec = vec![false; annx.len()]; // Re-sort the annotations by ref_id - annx.sort_by(|a, b| { - let key: for<'a> fn(&'a PreAnnotation) -> (_, _, _, _, &'a Vec<_>) = |x: &PreAnnotation| { - ( - x.ref_id, - x.ref_start, - x.tig_start, - x.match_len, - &x.mismatches, - ) - }; - key(a).cmp(&key(b)) - }); + annx.sort_by(Alignment::cmp_by_ref_id_ref_start); let mut i1 = 0; loop { if i1 == annx.len() { @@ -1136,7 +1100,7 @@ fn delete_improper_matches(annx: &mut Vec) { } erase_if(annx, &to_delete); // Re-sort using standard sort order. - annx.sort(); + annx.sort_by(Alignment::cmp_by_tig_start_len); } /// Amongst V segments starting at zero on the V segment, if some start with @@ -1144,7 +1108,7 @@ fn delete_improper_matches(annx: &mut Vec) { fn retain_head_v_segments_with_start_codon( b_seq: &[u8], refdata: &RefData, - annx: &mut Vec, + annx: &mut Vec, ) { let mut have_starter = false; for annxi in annx.iter() { @@ -1199,7 +1163,7 @@ fn remove_inferior_matches( rheaders: &[String], verbose: bool, log: &mut Vec, - annx: &mut Vec, + annx: &mut Vec, ) { let mut to_delete: Vec = vec![false; annx.len()]; let mut ts: Vec<(usize, usize)> = annx @@ -1213,14 +1177,14 @@ fn remove_inferior_matches( let j1 = next_diff1_2(&ts, i1); let mut tlen1 = 0; for k in i1..j1 { - tlen1 += annx[ts[k].1].match_len; + tlen1 += annx[ts[k].1].len; } let mut i2 = 0; while i2 < ts.len() { let j2 = next_diff1_2(&ts, i2); let mut tlen2 = 0; for k in i2..j2 { - tlen2 += annx[ts[k].1].match_len; + tlen2 += annx[ts[k].1].len; } let (mut m1, mut m2) = (0, 0); let mut over = 0_i64; @@ -1241,11 +1205,11 @@ fn remove_inferior_matches( for k1 in i1..j1 { let u1 = ts[k1].1; let l1 = annx[u1].tig_start; - let len1 = annx[u1].match_len; + let len1 = annx[u1].len; for t in &ts[i2..j2] { let u2 = t.1; let l2 = annx[u2].tig_start; - let len2 = annx[u2].match_len; + let len2 = annx[u2].len; let start = max(l1, l2); let stop = min(l1 + len1, l2 + len2); if start >= stop { @@ -1296,7 +1260,7 @@ fn remove_inferior_matches( let t = item.0; let u = item.1; let l = annx[u].tig_start; - let len = annx[u].match_len; + let len = annx[u].len; let p = annx[u].ref_start; let mis = annx[u].mismatches.len(); fwriteln!( @@ -1315,7 +1279,7 @@ fn remove_inferior_matches( let t = item.0; let u = item.1; let l = annx[u].tig_start; - let len = annx[u].match_len; + let len = annx[u].len; let p = annx[u].ref_start; let mis = annx[u].mismatches.len(); fwriteln!( @@ -1355,7 +1319,7 @@ fn find_indels_in_v_or_utr( refdata: &RefData, verbose: bool, log: &mut Vec, - annx: &mut Vec, + annx: &mut Vec, ) { let mut to_delete: Vec = vec![false; annx.len()]; let mut aligns = vec![0; refdata.refs.len()]; @@ -1382,7 +1346,7 @@ fn find_indels_in_v_or_utr( if l1 >= l2 { continue; } - let (mut len1, mut len2) = (annx[i1].match_len as usize, annx[i2].match_len as usize); + let (mut len1, mut len2) = (annx[i1].len as usize, annx[i2].len as usize); if l1 + len1 > l2 + len2 { continue; } @@ -1407,7 +1371,7 @@ fn find_indels_in_v_or_utr( } } mis.append(&mut annx[i2].mismatches.clone()); - annx[i1].match_len = tot1 as i32; + annx[i1].len = tot1 as i32; annx[i1].mismatches = mis; to_delete[i2] = true; continue; @@ -1444,9 +1408,9 @@ fn find_indels_in_v_or_utr( best_ipos = ipos; } } - annx[i1].match_len = best_ipos - start1; + annx[i1].len = best_ipos - start1; annx[i1].mismatches = best_mis1; - annx[i2].match_len = stop1 - best_ipos - ins; + annx[i2].len = stop1 - best_ipos - ins; annx[i2].tig_start = best_ipos + ins; annx[i2].ref_start = best_ipos + ins + off2; annx[i2].mismatches = best_mis2; @@ -1486,10 +1450,10 @@ fn find_indels_in_v_or_utr( best_dpos = dpos; } } - annx[i1].match_len = best_dpos - start2; + annx[i1].len = best_dpos - start2; annx[i1].mismatches = best_mis1; annx[i2].tig_start = best_dpos + del - off2; - annx[i2].match_len = stop2 - best_dpos - del; + annx[i2].len = stop2 - best_dpos - del; annx[i2].ref_start = best_dpos + del; annx[i2].mismatches = best_mis2; continue; @@ -1592,8 +1556,8 @@ fn find_indels_in_v_or_utr( } if del.len() + ins.len() == 0 { to_delete[i2] = true; - len1 = (annx[i2].tig_start + annx[i2].match_len - annx[i1].tig_start) as usize; - annx[i1].match_len = len1 as i32; + len1 = (annx[i2].tig_start + annx[i2].len - annx[i1].tig_start) as usize; + annx[i1].len = len1 as i32; annx[i1].mismatches.truncate(0); for j in 0..len1 { if b_seq[l1 + j] != refdata.refs[t].get(p1 + j) { @@ -1603,8 +1567,8 @@ fn find_indels_in_v_or_utr( } if del.len() + ins.len() == 1 { annx[i2].tig_start = l2 as i32; - annx[i1].match_len = len1 as i32; - annx[i2].match_len = len2 as i32; + annx[i1].len = len1 as i32; + annx[i2].len = len2 as i32; annx[i2].ref_start = p2 as i32; annx[i1].mismatches.truncate(0); annx[i2].mismatches.truncate(0); @@ -1642,7 +1606,7 @@ fn retain_best_alignment( refdata: &RefData, verbose: bool, log: &mut Vec, - annx: &mut Vec, + annx: &mut Vec, ) { let mut combo = Vec::<(String, i32, usize)>::new(); for (i, a) in annx.iter().enumerate() { @@ -1671,7 +1635,7 @@ fn retain_best_alignment( locs.push(combo[k].2); let a = &annx[combo[k].2]; rstarts.push(a.ref_start as usize); - cov.push((a.tig_start as usize, (a.tig_start + a.match_len) as usize)); + cov.push((a.tig_start as usize, (a.tig_start + a.len) as usize)); mis += a.mismatches.len(); if !refdata.is_u(a.ref_id as usize) { mis_nutr += a.mismatches.len(); @@ -1981,7 +1945,7 @@ fn retain_best_alignment( /// Delete some subsumed alignments. // FIXME: collapse this and remove_subsumed_matches once the match types are // collapsed. They have slightly different behavior. -fn remove_subsumed_alignments(annx: &mut Vec) { +fn remove_subsumed_alignments(annx: &mut Vec) { let mut to_delete = vec![false; annx.len()]; let mut i = 0; while i < annx.len() { @@ -1996,7 +1960,7 @@ fn remove_subsumed_alignments(annx: &mut Vec) { for k2 in i..j { if annx[k1].ref_id == annx[k2].ref_id && annx[k1].ref_start == annx[k2].ref_start - && annx[k1].match_len > annx[k2].match_len + && annx[k1].len > annx[k2].len { to_delete[k2] = true; } @@ -2009,14 +1973,14 @@ fn remove_subsumed_alignments(annx: &mut Vec) { } /// Extend some alignments. -fn extend_alignments(b_seq: &[u8], refs: &[DnaString], annx: &mut [PreAnnotation]) { +fn extend_alignments(b_seq: &[u8], refs: &[DnaString], annx: &mut [Alignment]) { let mut aligns = vec![0; refs.len()]; for a in annx.iter() { aligns[a.ref_id as usize] += 1; } for a in annx { let t = a.ref_id as usize; - let len = a.match_len as usize; + let len = a.len as usize; if aligns[t] == 1 && a.ref_start == 0 && len < refs[t].len() @@ -2029,7 +1993,7 @@ fn extend_alignments(b_seq: &[u8], refs: &[DnaString], annx: &mut [PreAnnotation a.mismatches.push(q); } } - a.match_len = refs[t].len() as i32; + a.len = refs[t].len() as i32; } } } @@ -2040,12 +2004,12 @@ fn retain_longer_v_alignments( refdata: &RefData, verbose: bool, log: &mut Vec, - annx: &mut Vec, + annx: &mut Vec, ) { let mut lens = vec![0; refdata.refs.len()]; for a in annx.iter() { let t = a.ref_id as usize; - lens[t] += a.ref_start + a.match_len; + lens[t] += a.ref_start + a.len; } let mut to_delete: Vec = vec![false; annx.len()]; for i1 in 0..annx.len() { @@ -2085,7 +2049,7 @@ fn retain_longer_v_alignments( /// assume that the J aligns up to exactly the point where the C starts, or to /// one base after. We require that the last 20 bases of the J match with at /// most 5 mismatches. -fn annotate_j_for_ig_with_c_and_v(b_seq: &[u8], refdata: &RefData, annx: &mut Vec) { +fn annotate_j_for_ig_with_c_and_v(b_seq: &[u8], refdata: &RefData, annx: &mut Vec) { let (mut igv, mut igj) = (false, false); let mut igc = -1_i32; const J_TOT: i32 = 20; @@ -2149,21 +2113,21 @@ fn annotate_j_for_ig_with_c_and_v(b_seq: &[u8], refdata: &RefData, annx: &mut Ve mis.push(i + j); } } - annx.push(PreAnnotation { + annx.push(Alignment { tig_start: i, - match_len: n, + len: n, ref_id: best_t, ref_start: 0, mismatches: mis, }); - annx.sort(); + annx.sort_by(Alignment::cmp_by_tig_start_len); } } } /// If there is a D gene alignment that is from a different chain type than the V gene /// alignment, delete it. -fn delete_d_if_chain_doesnt_match_v(refdata: &RefData, annx: &mut Vec) { +fn delete_d_if_chain_doesnt_match_v(refdata: &RefData, annx: &mut Vec) { let mut to_delete: Vec = vec![false; annx.len()]; for i1 in 0..annx.len() { let t1 = annx[i1].ref_id as usize; @@ -2197,7 +2161,7 @@ fn annotate_d_between_v_j( b_seq: &[u8], b: &DnaString, refdata: &RefData, - annx: &mut Vec, + annx: &mut Vec, ) { let (mut v, mut d, mut j) = (false, false, false); let (mut vstop, mut jstart) = (0, 0); @@ -2211,7 +2175,7 @@ fn annotate_d_between_v_j( if rt == 0 || rt == 4 || rt == 5 { if refdata.segtype[t] == "V" { v = true; - vstop = ann.tig_start + ann.match_len; + vstop = ann.tig_start + ann.len; v_rtype = rt; } else if refdata.segtype[t] == "D" { d = true; @@ -2261,14 +2225,14 @@ fn annotate_d_between_v_j( best_matches = max(best_matches, result.1); } if results[0].1 == best_matches { - annx.push(PreAnnotation { + annx.push(Alignment { tig_start: results[0].4 as i32, - match_len: (results[0].0 + results[0].1) as i32, + len: (results[0].0 + results[0].1) as i32, ref_id: results[0].2 as i32, ref_start: 0, mismatches: results[0].5.clone(), }); - annx.sort(); + annx.sort_by(Alignment::cmp_by_tig_start_len); } } } @@ -2277,7 +2241,7 @@ fn annotate_d_between_v_j( /// A J segment that goes up to its end beats any J segment that doesn't. /// If they both go up to the end, choose. -fn retain_longer_j_segment(b_seq: &[u8], refdata: &RefData, annx: &mut Vec) { +fn retain_longer_j_segment(b_seq: &[u8], refdata: &RefData, annx: &mut Vec) { let mut to_delete: Vec = vec![false; annx.len()]; for i1 in 0..annx.len() { for i2 in 0..annx.len() { @@ -2289,7 +2253,7 @@ fn retain_longer_j_segment(b_seq: &[u8], refdata: &RefData, annx: &mut Vec, + annx: &mut Vec, ) { let mut to_delete: Vec = vec![false; annx.len()]; for i1 in 0..annx.len() { @@ -2400,7 +2364,7 @@ fn retain_better_v_segment( b_seq: &[u8], b: &DnaString, refdata: &RefData, - annx: &mut Vec, + annx: &mut Vec, ) { let mut nv = 0; for a in annx.iter() { @@ -2467,7 +2431,7 @@ fn retain_better_v_segment( } // Remove UTR annotations that have no matching V annotation. -fn remove_utr_without_matching_v(rheaders: &[String], annx: &mut Vec) { +fn remove_utr_without_matching_v(rheaders: &[String], annx: &mut Vec) { let mut to_delete: Vec = vec![false; annx.len()]; let (mut u, mut v) = (Vec::::new(), Vec::::new()); for a in annx.iter() { @@ -2502,7 +2466,7 @@ fn remove_utr_without_matching_v(rheaders: &[String], annx: &mut Vec) { +fn retain_much_better_aligned_v_segment(rheaders: &[String], annx: &mut Vec) { let mut to_delete: Vec = vec![false; annx.len()]; let mut vs = Vec::<(usize, usize)>::new(); for (i, a) in annx.iter().enumerate() { @@ -2523,13 +2487,13 @@ fn retain_much_better_aligned_v_segment(rheaders: &[String], annx: &mut Vec) { +fn remove_subsumed_alignments_2(annx: &mut Vec) { let mut to_delete: Vec = vec![false; annx.len()]; for i1 in 0..annx.len() { for i2 in 0..annx.len() { @@ -2579,7 +2543,7 @@ fn remove_subsumed_alignments_2(annx: &mut Vec) { continue; } let (l1, l2) = (annx[i1].tig_start, annx[i2].tig_start); - let (len1, len2) = (annx[i1].match_len, annx[i2].match_len); + let (len1, len2) = (annx[i1].len, annx[i2].len); let (p1, p2) = (annx[i1].ref_start, annx[i2].ref_start); if l1 != l2 || p1 != p2 { continue; @@ -2593,7 +2557,7 @@ fn remove_subsumed_alignments_2(annx: &mut Vec) { } /// If we see TRBJ1 and not TRBJ2, delete any TRBC2. And conversely. -fn remove_unmatched_trbc(rheaders: &[String], annx: &mut Vec) { +fn remove_unmatched_trbc(rheaders: &[String], annx: &mut Vec) { let mut to_delete: Vec = vec![false; annx.len()]; let (mut j1, mut j2) = (false, false); for a in annx.iter() { @@ -2618,7 +2582,7 @@ fn remove_unmatched_trbc(rheaders: &[String], annx: &mut Vec) { } /// Pick between equally performant Js and likewise for Cs. -fn downselect_equally_performant_j_and_c(refdata: &RefData, annx: &mut Vec) { +fn downselect_equally_performant_j_and_c(refdata: &RefData, annx: &mut Vec) { let mut to_delete = vec![false; annx.len()]; for pass in 0..2 { for i1 in 0..annx.len() { @@ -2640,7 +2604,7 @@ fn downselect_equally_performant_j_and_c(refdata: &RefData, annx: &mut Vec) { +fn downselect_to_best_c(rheaders: &[String], annx: &mut Vec) { let mut to_delete = vec![false; annx.len()]; for i1 in 0..annx.len() { let t1 = annx[i1].ref_id; @@ -2681,7 +2645,7 @@ fn downselect_to_best_c(rheaders: &[String], annx: &mut Vec) { continue; } let (l1, l2) = (annx[i1].tig_start as usize, annx[i2].tig_start as usize); - let (len1, len2) = (annx[i1].match_len as usize, annx[i2].match_len as usize); + let (len1, len2) = (annx[i1].len as usize, annx[i2].len as usize); // let (p1,p2) = (annx[i1].3,annx[i2].3); if l1 + len1 != l2 + len2 { continue; @@ -2697,15 +2661,15 @@ fn downselect_to_best_c(rheaders: &[String], annx: &mut Vec) { /// Remove some subsumed extended annotations. // FIXME: collapse this and the other remove subsumed alignments functions. -fn remove_subsumed_extended_alignments(rheaders: &[String], annx: &mut Vec) { +fn remove_subsumed_extended_alignments(rheaders: &[String], annx: &mut Vec) { let mut to_delete: Vec = vec![false; annx.len()]; for i1 in 0..annx.len() { let l1 = annx[i1].tig_start as usize; - let len1 = annx[i1].match_len as usize; + let len1 = annx[i1].len as usize; for i2 in 0..annx.len() { let t2 = annx[i2].ref_id as usize; let l2 = annx[i2].tig_start as usize; - let len2 = annx[i2].match_len as usize; + let len2 = annx[i2].len as usize; if len2 >= len1 { continue; } @@ -2780,17 +2744,7 @@ pub fn print_annotations( abut: bool, verbose: bool, ) { - let mut ann = Vec::::new(); - annotate_seq_core( - b, - refdata, - &mut ann, - true, - allow_improper, - abut, - log, - verbose, - ); + let ann = annotate_seq_core(b, refdata, true, allow_improper, abut, log, verbose); print_some_annotations(refdata, &ann, log, verbose); } @@ -3472,8 +3426,7 @@ impl ContigAnnotation { is_cell: bool, // was the barcode declared a cell? jsupp: Option, // num reads, umis supporting junction ) -> ContigAnnotation { - let mut ann = Vec::::new(); - annotate_seq(b, refdata, &mut ann, true, false, true); + let ann = annotate_seq(b, refdata, true, false, true); let (is_productive, productive_criteria) = is_productive_contig(b, refdata, &ann); ContigAnnotation::from_annotate_seq( b, diff --git a/vdj_ann_ref/src/lib.rs b/vdj_ann_ref/src/lib.rs index af4ede428..f4d395e77 100644 --- a/vdj_ann_ref/src/lib.rs +++ b/vdj_ann_ref/src/lib.rs @@ -116,8 +116,6 @@ pub fn make_vdj_ref_data( #[cfg(test)] mod tests { - use vdj_ann::annotate::Annotation; - use super::*; // The following test checks for alignment of a D region. This example was fixed by code @@ -141,8 +139,7 @@ mod tests { let (is_tcr, is_bcr) = (true, false); let mut refdata = RefData::new(); make_vdj_ref_data_core(&mut refdata, refx, &ext_refx, is_tcr, is_bcr, None); - let mut ann = Vec::::new(); - annotate_seq(&seq, &refdata, &mut ann, true, false, true); + let ann = annotate_seq(&seq, &refdata, true, false, true); let mut have_d = false; for a in ann { if refdata.is_d(a.ref_id as usize) {