From 3a2f1e6c30f8a0a161258e73583cb505c4af30c2 Mon Sep 17 00:00:00 2001 From: Erik Borgstrom Date: Fri, 15 Mar 2024 10:33:13 +0100 Subject: [PATCH] refactor: clarify CDR3 annotation algorithms (#399) --- amino/src/lib.rs | 10 +- enclone/src/info.rs | 8 +- enclone/src/misc2.rs | 4 +- enclone_args/src/read_json.rs | 20 +-- enclone_core/src/mammalian_fixed_len.rs | 4 +- enclone_print/src/print_utils2.rs | 16 +- enclone_print/src/proc_cvar_auto.rs | 28 +-- enclone_print/src/proc_lvar_auto.rs | 62 +++---- enclone_stuff/src/flag_defective.rs | 14 +- enclone_stuff/src/populate_features.rs | 4 +- vdj_ann/src/annotate.rs | 228 ++++++++++++++---------- vdj_ann/src/transcript.rs | 9 +- 12 files changed, 231 insertions(+), 176 deletions(-) diff --git a/amino/src/lib.rs b/amino/src/lib.rs index d9e456659..7e029eae4 100644 --- a/amino/src/lib.rs +++ b/amino/src/lib.rs @@ -132,14 +132,14 @@ pub fn codon_to_aa(codon: &[u8]) -> u8 { // Convert a given DNA sequence to amino acids, starting at a given position. -pub fn aa_seq(x: &[u8], start: usize) -> Vec { +pub fn nucleotide_to_aminoacid_sequence(dna_seq: &[u8], start: usize) -> Vec { let mut a = Vec::::new(); - if x.len() >= 3 { - for j in (start..x.len() - 3 + 1).step_by(3) { - if x[j] == b'-' && x[j + 1] == b'-' && x[j + 2] == b'-' { + if dna_seq.len() >= 3 { + for pos in (start..dna_seq.len() - 3 + 1).step_by(3) { + if dna_seq[pos] == b'-' && dna_seq[pos + 1] == b'-' && dna_seq[pos + 2] == b'-' { a.push(b'-'); } else { - a.push(codon_to_aa(&x[j..j + 3])); + a.push(codon_to_aa(&dna_seq[pos..pos + 3])); } } } diff --git a/enclone/src/info.rs b/enclone/src/info.rs index b52c881ce..e08f34c41 100644 --- a/enclone/src/info.rs +++ b/enclone/src/info.rs @@ -6,7 +6,7 @@ use enclone_core::barcode_fate::BarcodeFate; use vdj_ann::refx; use self::refx::RefData; -use amino::{aa_seq, codon_to_aa}; +use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence}; use ansi_escape::emit_end_escape; use debruijn::{dna_string::DnaString, Mer}; use enclone_core::defs::{CloneInfo, EncloneControl, ExactClonotype}; @@ -130,8 +130,8 @@ pub fn build_info( // Optimize to compute entry in aa_mod_indel and the inserted aa sequence. - let aa_full = aa_seq(&x.seq, 0); - let ref_aa = aa_seq(&refdata.refs[vid].to_ascii_vec(), 0); + let aa_full = nucleotide_to_aminoacid_sequence(&x.seq, 0); + let ref_aa = nucleotide_to_aminoacid_sequence(&refdata.refs[vid].to_ascii_vec(), 0); let ins_len_aa = ins_len / 3; const EXT: usize = 10; let ins_pos_low = if ins_pos / 3 < EXT { @@ -180,7 +180,7 @@ pub fn build_info( lens.push(x.seq.len()); tigs.push(x.seq.clone()); tigs_amino.push(x.seq.clone()); - aa_mod_indel.push(aa_seq(&x.seq, 0)); + aa_mod_indel.push(nucleotide_to_aminoacid_sequence(&x.seq, 0)); tigs_ins.push(Vec::new()); } diff --git a/enclone/src/misc2.rs b/enclone/src/misc2.rs index 7d6d3d010..911ace84e 100644 --- a/enclone/src/misc2.rs +++ b/enclone/src/misc2.rs @@ -4,7 +4,7 @@ use crate::innate::mark_innate; use crate::misc3::study_consensus; -use amino::aa_seq; +use amino::nucleotide_to_aminoacid_sequence; use debruijn::dna_string::DnaString; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{EncloneControl, ExactClonotype, Junction, TigData, TigData0, TigData1}; @@ -180,7 +180,7 @@ pub fn create_exact_subclonotype_core( // are the same, which in principle they should be, but this is not actually always true. // However this is hard to fix. - let aa = aa_seq(tig_bc[r][m].seq(), 0); + let aa = nucleotide_to_aminoacid_sequence(tig_bc[r][m].seq(), 0); let mut d_start = None; if tig_bc[r][m].d_start.is_some() { d_start = Some(tig_bc[r][m].d_start.unwrap() + utr.len() - tig_bc[r][m].v_start); diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index a2e9880a3..d66fceb31 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -191,10 +191,10 @@ fn process_json_annotation( } else if !is_productive_contig(&x, refdata, &ann1).0 { return Ok(res); } - let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - get_cdr3_using_ann(&x, refdata, &ann1, &mut cdr3); - cdr3_aa = stringme(&cdr3[0].1); - cdr3_start = cdr3[0].0; + let found_cdr3s = get_cdr3_using_ann(&x, refdata, &ann1); + let cdr3 = found_cdr3s.first().unwrap(); + cdr3_aa = stringme(&cdr3.aa_seq); + cdr3_start = cdr3.start_position_on_contig; cdr3_dna = x .slice(cdr3_start, cdr3_start + 3 * cdr3_aa.len()) .to_string(); @@ -403,23 +403,23 @@ fn process_json_annotation( // than is used in the current version of enclone. This could result in internal // inconsistencies, leading to an assert somewhere downstream. - let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); let x = DnaString::from_dna_string(&ann.sequence); - get_cdr3_using_ann(&x, refdata, &annv, &mut cdr3); - if cdr3.is_empty() { + let found_cdr3s = get_cdr3_using_ann(&x, refdata, &annv); + if found_cdr3s.is_empty() { return Ok(res); } - let cdr3_aa_alt = stringme(&cdr3[0].1); + let cdr3 = found_cdr3s.first().unwrap(); + let cdr3_aa_alt = stringme(&cdr3.aa_seq); if cdr3_aa != cdr3_aa_alt { // This is particularly pathological and rare: - if tig_start as usize > cdr3[0].0 { + if tig_start as usize > cdr3.start_position_on_contig { return Ok(res); } // Define start. - cdr3_start = cdr3[0].0 - tig_start as usize; + cdr3_start = cdr3.start_position_on_contig - tig_start as usize; // Define cdr3. diff --git a/enclone_core/src/mammalian_fixed_len.rs b/enclone_core/src/mammalian_fixed_len.rs index 15ba7037d..6ab76175c 100644 --- a/enclone_core/src/mammalian_fixed_len.rs +++ b/enclone_core/src/mammalian_fixed_len.rs @@ -1,6 +1,6 @@ // Copyright (c) 2021 10X Genomics, Inc. All rights reserved. -use amino::aa_seq; +use amino::nucleotide_to_aminoacid_sequence; use std::collections::HashMap; use string_utils::TextUtils; use superslice::Ext; @@ -54,7 +54,7 @@ pub fn mammalian_fixed_len_peer_groups(refdata: &RefData) -> Vec::new(); refdata.refs.len()]; for (i, pgi) in pg.iter_mut().enumerate() { if refdata.is_v(i) { - let aa = aa_seq(&refdata.refs[i].to_ascii_vec(), 0); + let aa = nucleotide_to_aminoacid_sequence(&refdata.refs[i].to_ascii_vec(), 0); let rtype = refdata.rtype[i]; let chain_type = if rtype == 0 { "IGH" diff --git a/enclone_print/src/print_utils2.rs b/enclone_print/src/print_utils2.rs index 1541c3fdc..ca03c1152 100644 --- a/enclone_print/src/print_utils2.rs +++ b/enclone_print/src/print_utils2.rs @@ -7,7 +7,7 @@ use crate::print_utils1::color_codon; use crate::proc_cvar_auto::proc_cvar_auto; use crate::proc_lvar2::proc_lvar2; use crate::proc_lvar_auto::proc_lvar_auto; -use amino::{aa_seq, codon_to_aa}; +use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence}; use enclone_core::allowed_vars::LVARS_ALLOWED; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{AlleleData, ColInfo, EncloneControl, ExactClonotype, GexInfo, POUT_SEP}; @@ -564,7 +564,12 @@ pub fn row_fill( // Speak some other column entries. let xm = &ex.share[mid]; - speakc!(u, col, "vj_aa".to_string(), stringme(&aa_seq(&xm.seq, 0))); + speakc!( + u, + col, + "vj_aa".to_string(), + stringme(&nucleotide_to_aminoacid_sequence(&xm.seq, 0)) + ); speakc!(u, col, "vj_seq".to_string(), stringme(&xm.seq)); let mut dna = Vec::::new(); for p in xm.fr1_start..xm.seq_del_amino.len() { @@ -578,7 +583,12 @@ pub fn row_fill( dna.push(xm.seq_del_amino[p]); } } - speakc!(u, col, "vj_aa_nl".to_string(), stringme(&aa_seq(&dna, 0))); + speakc!( + u, + col, + "vj_aa_nl".to_string(), + stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)) + ); speakc!(u, col, "vj_seq_nl".to_string(), stringme(&dna)); speakc!(u, col, "seq".to_string(), stringme(&xm.full_seq)); let mut vv = Vec::::new(); diff --git a/enclone_print/src/proc_cvar_auto.rs b/enclone_print/src/proc_cvar_auto.rs index 6cabbb453..d526b1a2f 100644 --- a/enclone_print/src/proc_cvar_auto.rs +++ b/enclone_print/src/proc_cvar_auto.rs @@ -6,7 +6,7 @@ use crate::print_utils1::{ test_internal_error_seq, }; use crate::print_utils3::comp_edit; -use amino::{aa_seq, codon_to_aa}; +use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence}; use enclone_core::align_to_vdj_ref::{align_to_vdj_ref, cigar}; use enclone_core::defs::{AlleleData, ColInfo, EncloneControl, ExactClonotype, POUT_SEP}; use enclone_core::median::rounded_median; @@ -212,7 +212,7 @@ pub fn proc_cvar_auto( let dna = refdata.refs[x.v_ref_id].to_ascii_vec() [x.cdr1_start.unwrap()..x.fr2_start.unwrap()] .to_vec(); - y = stringme(&aa_seq(&dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)); } } else if x.cdr2_start.is_some() && x.fr3_start.is_some() @@ -221,7 +221,7 @@ pub fn proc_cvar_auto( let dna = refdata.refs[x.v_ref_id].to_ascii_vec() [x.cdr2_start.unwrap()..x.fr3_start.unwrap()] .to_vec(); - y = stringme(&aa_seq(&dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)); } (y, Vec::new(), "clono".to_string()) @@ -264,9 +264,9 @@ pub fn proc_cvar_auto( let arg1 = vname.between2("cdr", "_aa").force_i64(); let x = &ex.share[mid]; let y = if arg1 == 1 { - get_cdr1(x, 0, 0).map(|c| stringme(&aa_seq(c.as_bytes(), 0))) + get_cdr1(x, 0, 0).map(|c| stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0))) } else if arg1 == 2 { - get_cdr2(x, 0, 0).map(|c| stringme(&aa_seq(c.as_bytes(), 0))) + get_cdr2(x, 0, 0).map(|c| stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0))) } else { Some(x.cdr3_aa.clone()) } @@ -291,7 +291,7 @@ pub fn proc_cvar_auto( get_cdr3(x, -1, -1) }; let y = if let Some(c) = c { - stringme(&aa_seq(c.as_bytes(), 0)) + stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0)) } else { "unknown".to_string() }; @@ -348,7 +348,7 @@ pub fn proc_cvar_auto( } } test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?; - y = stringme(&aa_seq(&dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)); } } else if arg1 == 2 { if x.cdr2_start.is_some() @@ -372,7 +372,7 @@ pub fn proc_cvar_auto( } } test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?; - y = stringme(&aa_seq(&dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)); } } else if x.cdr3_start as i64 - left >= 0 && x.cdr3_start as i64 - left < x.seq_del_amino.len() as i64 @@ -395,7 +395,7 @@ pub fn proc_cvar_auto( } } test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?; - y = stringme(&aa_seq(&dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)); } (y, Vec::new(), "exact".to_string()) @@ -848,7 +848,7 @@ pub fn proc_cvar_auto( Some(stringme(dna)) }; let y = if let Some(c) = c { - stringme(&aa_seq(c.as_bytes(), 0)) + stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0)) } else { "unknown".to_string() }; @@ -868,21 +868,21 @@ pub fn proc_cvar_auto( let dna = refdata.refs[x.v_ref_id].to_ascii_vec() [x.fr1_start..x.cdr1_start.unwrap()] .to_vec(); - y = stringme(&aa_seq(&dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)); } } else if arg1 == 2 { if x.fr2_start.unwrap() <= x.cdr2_start.unwrap() { let dna = refdata.refs[x.v_ref_id].to_ascii_vec() [x.fr2_start.unwrap()..x.cdr2_start.unwrap()] .to_vec(); - y = stringme(&aa_seq(&dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)); } } else if arg1 == 3 { if x.fr3_start.is_some() && x.fr3_start.unwrap() <= x.cdr3_start - x.ins_len() { let dna = refdata.refs[x.v_ref_id].to_ascii_vec(); if x.cdr3_start <= dna.len() { let dna = dna[x.fr3_start.unwrap()..x.cdr3_start - x.ins_len()].to_vec(); - y = stringme(&aa_seq(&dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0)); } } } else { @@ -890,7 +890,7 @@ pub fn proc_cvar_auto( let aa_len = if heavy { 10 } else { 9 }; let dna = refdata.refs[x.j_ref_id].to_ascii_vec(); let dna = &dna[dna.len() - 1 - 3 * aa_len..dna.len() - 1]; - y = stringme(&aa_seq(dna, 0)); + y = stringme(&nucleotide_to_aminoacid_sequence(dna, 0)); } (y, Vec::new(), "clono".to_string()) diff --git a/enclone_print/src/proc_lvar_auto.rs b/enclone_print/src/proc_lvar_auto.rs index f4edd177c..8cce1db10 100644 --- a/enclone_print/src/proc_lvar_auto.rs +++ b/enclone_print/src/proc_lvar_auto.rs @@ -1,7 +1,7 @@ // Copyright (c) 2021 10x Genomics, Inc. All rights reserved. // This file is auto-generated by the crate enclone_vars, please do not edit. -use amino::{aa_seq, codon_to_aa}; +use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence}; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{ColInfo, EncloneControl, ExactClonotype, GexInfo, POUT_SEP}; use enclone_core::median::{median_f64, rounded_median}; @@ -151,7 +151,7 @@ pub fn proc_lvar_auto( let reg = Regex::new(vname.between2("count_", "")).unwrap(); let mut n = 0; for j in 0..ex.share.len() { - let aa = aa_seq(&ex.share[j].seq, 0); // seems inefficient + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq, 0); // seems inefficient n += reg.find_iter(strme(&aa)).count(); } @@ -168,7 +168,7 @@ pub fn proc_lvar_auto( let reg = Regex::new(vname.between2("count_", "_cell")).unwrap(); let mut n = 0; for j in 0..ex.share.len() { - let aa = aa_seq(&ex.share[j].seq, 0); // seems inefficient + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq, 0); // seems inefficient n += reg.find_iter(strme(&aa)).count(); } @@ -190,7 +190,7 @@ pub fn proc_lvar_auto( let cdr1 = ex.share[j].cdr1_start.unwrap(); let fwr2 = ex.share[j].fr2_start.unwrap(); if cdr1 < fwr2 { - let aa = aa_seq(&ex.share[j].seq[cdr1..fwr2], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr1..fwr2], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -198,13 +198,13 @@ pub fn proc_lvar_auto( let cdr2 = ex.share[j].cdr2_start.unwrap(); let fwr3 = ex.share[j].fr3_start.unwrap(); if cdr2 < fwr3 { - let aa = aa_seq(&ex.share[j].seq[cdr2..fwr3], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr2..fwr3], 0); n += reg.find_iter(strme(&aa)).count(); } } let cdr3 = ex.share[j].cdr3_start; let fwr4 = cdr3 + 3 * ex.share[j].cdr3_aa.len(); - let aa = aa_seq(&ex.share[j].seq[cdr3..fwr4], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr3..fwr4], 0); n += reg.find_iter(strme(&aa)).count(); } @@ -225,7 +225,7 @@ pub fn proc_lvar_auto( let cdr1 = ex.share[j].cdr1_start.unwrap(); let fwr2 = ex.share[j].fr2_start.unwrap(); if cdr1 < fwr2 { - let aa = aa_seq(&ex.share[j].seq[cdr1..fwr2], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr1..fwr2], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -233,13 +233,13 @@ pub fn proc_lvar_auto( let cdr2 = ex.share[j].cdr2_start.unwrap(); let fwr3 = ex.share[j].fr3_start.unwrap(); if cdr2 < fwr3 { - let aa = aa_seq(&ex.share[j].seq[cdr2..fwr3], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr2..fwr3], 0); n += reg.find_iter(strme(&aa)).count(); } } let cdr3 = ex.share[j].cdr3_start; let fwr4 = cdr3 + 3 * ex.share[j].cdr3_aa.len(); - let aa = aa_seq(&ex.share[j].seq[cdr3..fwr4], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr3..fwr4], 0); n += reg.find_iter(strme(&aa)).count(); } @@ -267,7 +267,7 @@ pub fn proc_lvar_auto( let cdr1 = ex.share[j].cdr1_start.unwrap(); let fwr2 = ex.share[j].fr2_start.unwrap(); if cdr1 < fwr2 { - let aa = aa_seq(&ex.share[j].seq[cdr1..fwr2], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr1..fwr2], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -278,7 +278,7 @@ pub fn proc_lvar_auto( let cdr2 = ex.share[j].cdr2_start.unwrap(); let fwr3 = ex.share[j].fr3_start.unwrap(); if cdr2 < fwr3 { - let aa = aa_seq(&ex.share[j].seq[cdr2..fwr3], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr2..fwr3], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -287,7 +287,7 @@ pub fn proc_lvar_auto( for j in 0..ex.share.len() { let cdr3 = ex.share[j].cdr3_start; let fwr4 = cdr3 + 3 * ex.share[j].cdr3_aa.len(); - let aa = aa_seq(&ex.share[j].seq[cdr3..fwr4], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr3..fwr4], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -318,7 +318,7 @@ pub fn proc_lvar_auto( let cdr1 = ex.share[j].cdr1_start.unwrap(); let fwr2 = ex.share[j].fr2_start.unwrap(); if cdr1 < fwr2 { - let aa = aa_seq(&ex.share[j].seq[cdr1..fwr2], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr1..fwr2], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -329,7 +329,7 @@ pub fn proc_lvar_auto( let cdr2 = ex.share[j].cdr2_start.unwrap(); let fwr3 = ex.share[j].fr3_start.unwrap(); if cdr2 < fwr3 { - let aa = aa_seq(&ex.share[j].seq[cdr2..fwr3], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr2..fwr3], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -338,7 +338,7 @@ pub fn proc_lvar_auto( for j in 0..ex.share.len() { let cdr3 = ex.share[j].cdr3_start; let fwr4 = cdr3 + 3 * ex.share[j].cdr3_aa.len(); - let aa = aa_seq(&ex.share[j].seq[cdr3..fwr4], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[cdr3..fwr4], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -361,7 +361,7 @@ pub fn proc_lvar_auto( let fwr1 = ex.share[j].fr1_start; let cdr1 = ex.share[j].cdr1_start.unwrap(); if fwr1 < cdr1 { - let aa = aa_seq(&ex.share[j].seq[fwr1..cdr1], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr1..cdr1], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -369,7 +369,7 @@ pub fn proc_lvar_auto( let fwr2 = ex.share[j].fr2_start.unwrap(); let cdr2 = ex.share[j].cdr2_start.unwrap(); if fwr2 < cdr2 { - let aa = aa_seq(&ex.share[j].seq[fwr2..cdr2], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr2..cdr2], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -377,12 +377,12 @@ pub fn proc_lvar_auto( let fwr3 = ex.share[j].fr3_start.unwrap(); let cdr3 = ex.share[j].cdr3_start; if fwr3 < cdr3 { - let aa = aa_seq(&ex.share[j].seq[fwr3..cdr3], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr3..cdr3], 0); n += reg.find_iter(strme(&aa)).count(); } } let fwr4 = ex.share[j].cdr3_start + 3 * ex.share[j].cdr3_aa.len(); - let aa = aa_seq(&ex.share[j].seq[fwr4..], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr4..], 0); n += reg.find_iter(strme(&aa)).count(); } @@ -403,7 +403,7 @@ pub fn proc_lvar_auto( let fwr1 = ex.share[j].fr1_start; let cdr1 = ex.share[j].cdr1_start.unwrap(); if fwr1 < cdr1 { - let aa = aa_seq(&ex.share[j].seq[fwr1..cdr1], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr1..cdr1], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -411,7 +411,7 @@ pub fn proc_lvar_auto( let fwr2 = ex.share[j].fr2_start.unwrap(); let cdr2 = ex.share[j].cdr2_start.unwrap(); if fwr2 < cdr2 { - let aa = aa_seq(&ex.share[j].seq[fwr2..cdr2], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr2..cdr2], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -419,12 +419,12 @@ pub fn proc_lvar_auto( let fwr3 = ex.share[j].fr3_start.unwrap(); let cdr3 = ex.share[j].cdr3_start; if fwr3 < cdr3 { - let aa = aa_seq(&ex.share[j].seq[fwr3..cdr3], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr3..cdr3], 0); n += reg.find_iter(strme(&aa)).count(); } } let fwr4 = ex.share[j].cdr3_start + 3 * ex.share[j].cdr3_aa.len(); - let aa = aa_seq(&ex.share[j].seq[fwr4..], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr4..], 0); n += reg.find_iter(strme(&aa)).count(); } @@ -452,7 +452,7 @@ pub fn proc_lvar_auto( let fwr1 = ex.share[j].fr1_start; let cdr1 = ex.share[j].cdr1_start.unwrap(); if fwr1 < cdr1 { - let aa = aa_seq(&ex.share[j].seq[fwr1..cdr1], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr1..cdr1], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -463,7 +463,7 @@ pub fn proc_lvar_auto( let fwr2 = ex.share[j].fr2_start.unwrap(); let cdr2 = ex.share[j].cdr2_start.unwrap(); if fwr2 < cdr2 { - let aa = aa_seq(&ex.share[j].seq[fwr2..cdr2], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr2..cdr2], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -474,7 +474,7 @@ pub fn proc_lvar_auto( let fwr3 = ex.share[j].fr3_start.unwrap(); let cdr3 = ex.share[j].cdr3_start; if fwr3 < cdr3 { - let aa = aa_seq(&ex.share[j].seq[fwr3..cdr3], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr3..cdr3], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -482,7 +482,7 @@ pub fn proc_lvar_auto( } else { for j in 0..ex.share.len() { let fwr4 = ex.share[j].cdr3_start + 3 * ex.share[j].cdr3_aa.len(); - let aa = aa_seq(&ex.share[j].seq[fwr4..], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr4..], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -513,7 +513,7 @@ pub fn proc_lvar_auto( let fwr1 = ex.share[j].fr1_start; let cdr1 = ex.share[j].cdr1_start.unwrap(); if fwr1 < cdr1 { - let aa = aa_seq(&ex.share[j].seq[fwr1..cdr1], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr1..cdr1], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -524,7 +524,7 @@ pub fn proc_lvar_auto( let fwr2 = ex.share[j].fr2_start.unwrap(); let cdr2 = ex.share[j].cdr2_start.unwrap(); if fwr2 < cdr2 { - let aa = aa_seq(&ex.share[j].seq[fwr2..cdr2], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr2..cdr2], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -535,7 +535,7 @@ pub fn proc_lvar_auto( let fwr3 = ex.share[j].fr3_start.unwrap(); let cdr3 = ex.share[j].cdr3_start; if fwr3 < cdr3 { - let aa = aa_seq(&ex.share[j].seq[fwr3..cdr3], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr3..cdr3], 0); n += reg.find_iter(strme(&aa)).count(); } } @@ -543,7 +543,7 @@ pub fn proc_lvar_auto( } else { for j in 0..ex.share.len() { let fwr4 = ex.share[j].cdr3_start + 3 * ex.share[j].cdr3_aa.len(); - let aa = aa_seq(&ex.share[j].seq[fwr4..], 0); + let aa = nucleotide_to_aminoacid_sequence(&ex.share[j].seq[fwr4..], 0); n += reg.find_iter(strme(&aa)).count(); } } diff --git a/enclone_stuff/src/flag_defective.rs b/enclone_stuff/src/flag_defective.rs index 78b5ff066..657b6b2f5 100644 --- a/enclone_stuff/src/flag_defective.rs +++ b/enclone_stuff/src/flag_defective.rs @@ -2,7 +2,7 @@ // Flag defective reference sequences. -use amino::aa_seq; +use amino::nucleotide_to_aminoacid_sequence; use enclone_core::defs::EncloneControl; use io_utils::fwriteln; use itertools::Itertools; @@ -93,8 +93,8 @@ pub fn flag_defective( // Continue. let seq = refs.to_ascii_vec(); - let aa0 = aa_seq(&seq, 0); - let aa2 = aa_seq(&seq, 2); + let aa0 = nucleotide_to_aminoacid_sequence(&seq, 0); + let aa2 = nucleotide_to_aminoacid_sequence(&seq, 2); if aa2.contains(&b'*') && !aa0.contains(&b'*') { count += 1; *broken = true; @@ -142,7 +142,7 @@ pub fn flag_defective( // Test for broken. let seq = refs.to_ascii_vec(); - let aa = aa_seq(&seq, 0); + let aa = nucleotide_to_aminoacid_sequence(&seq, 0); let mut reasons = Vec::<&'static str>::new(); if !aa.starts_with(b"M") { reasons.push("does not begin with a start codon"); @@ -171,7 +171,7 @@ pub fn flag_defective( let mut seqx = seq.clone(); for _ in 1..=2 { let _ = seqx.remove(3 * j); - let aax = aa_seq(&seqx, 0); + let aax = nucleotide_to_aminoacid_sequence(&seqx, 0); if !aax.contains(&b'*') { fixable = true; } @@ -184,7 +184,7 @@ pub fn flag_defective( } if aa.len() >= 31 { for del in 1..=2 { - let aad = aa_seq(&seq, del); + let aad = nucleotide_to_aminoacid_sequence(&seq, del); if cdr3_score(&aad, chain_type, false) > 4 + cdr3_score(&aa, chain_type, false) { reasons.push("appears to be frameshifted"); @@ -208,7 +208,7 @@ pub fn flag_defective( let score = cdr3_score(&aa, chain_type, false); let mut frameshift = false; for del in 1..=2 { - let aad = aa_seq(&seq, del); + let aad = nucleotide_to_aminoacid_sequence(&seq, del); if score <= 6 && cdr3_score(&aad, chain_type, false) >= 3 + score { frameshift = true; } diff --git a/enclone_stuff/src/populate_features.rs b/enclone_stuff/src/populate_features.rs index 755c06bf2..4a2176fda 100644 --- a/enclone_stuff/src/populate_features.rs +++ b/enclone_stuff/src/populate_features.rs @@ -2,7 +2,7 @@ // Populate features. -use amino::aa_seq; +use amino::nucleotide_to_aminoacid_sequence; use enclone_core::defs::EncloneControl; use io_utils::fwriteln; use std::fmt::Write as _; @@ -34,7 +34,7 @@ pub fn populate_features( if broken[i] && ctl.gen_opt.require_unbroken_ok { continue; } - let aa = aa_seq(&refdata.refs[i].to_ascii_vec(), 0); + let aa = nucleotide_to_aminoacid_sequence(&refdata.refs[i].to_ascii_vec(), 0); let rtype = refdata.rtype[i]; let chain_type = if rtype == 0 { "IGH" diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index bccc4de59..ed0587a47 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -6,7 +6,7 @@ use crate::transcript::is_productive_contig; use crate::{refx::RefData, transcript::ContigStatus}; use align_tools::affine_align; -use amino::{aa_seq, have_start}; +use amino::{have_start, nucleotide_to_aminoacid_sequence}; use bio_edit::alignment::AlignmentOperation::{Del, Ins, Match, Subst, Xclip, Yclip}; use debruijn::{ dna_string::{DnaString, DnaStringSlice}, @@ -2654,72 +2654,115 @@ pub fn cdr3_motif_right() -> Vec> { vec![b"LTFG.GTRVTV".to_vec(), b"LIWG.GSKLSI".to_vec()] } -pub fn cdr3_min_len() -> usize { - 5 +const CDR3_MIN_LEN: usize = 5; +const LEFT_FLANK_MIN_SCORE: usize = 3; +const RIGHT_FLANK_MIN_SCORE: usize = 4; + +pub struct CDR3Annotation { + pub start_position_on_contig: usize, + pub aa_seq: Vec, + left_flank_score: usize, + right_flank_score: usize, } -pub fn get_cdr3(tig: &DnaStringSlice, cdr3: &mut Vec<(usize, Vec, usize, usize)>) { +pub fn get_cdr3(contig: &DnaStringSlice) -> Vec { const MIN_TOTAL_CDR3_SCORE: usize = 10; // about as high as one can go - let (left, right) = (cdr3_motif_left(), cdr3_motif_right()); - cdr3.clear(); - let x = tig.to_owned().to_ascii_vec(); - for i in 0..3 { - // go through three frames - let a = aa_seq(&x, i); - if a.len() < 4 { - return; - } - for j in 0..a.len() - min(a.len(), (cdr3_min_len() + 3) + 1) { - if a[j] == b'C' { - // CDR3 starts at position j on a - let first_f = j + (cdr3_min_len() - 3); - let last_f = a.len() - 4; - for k in first_f..last_f { - if k + right[0].len() > a.len() { + + let left_motifs = cdr3_motif_left(); + let right_motifs = cdr3_motif_right(); + + let contig_seq = contig.to_owned().to_ascii_vec(); + + let mut found_cdr3s: Vec = Vec::new(); + for frame_idx in 0..3 { + // Go through three frames. + + // Convert the DNA sequence + frame to an amino acid sequence. + let amino_acid_seq = nucleotide_to_aminoacid_sequence(&contig_seq, frame_idx); + if amino_acid_seq.len() < 4 { + continue; + } + + // Check each position in the AA seq to see if we find a CDR3 there. + for cdr3_start_pos in + 0..amino_acid_seq.len() - min(amino_acid_seq.len(), (CDR3_MIN_LEN + 3) + 1) + { + // The CDR3 has to start with a Cysteine. + if amino_acid_seq[cdr3_start_pos] == b'C' { + // Search for the right flank set up start and end positions for the search. + let first_f = cdr3_start_pos + (CDR3_MIN_LEN - 3); + let last_f = amino_acid_seq.len() - RIGHT_FLANK_MIN_SCORE; + for right_motif_start_pos in first_f..last_f { + // Don't look further if the remaining part of the amino_acid_seq is to short to find the full right flank motif. + if right_motif_start_pos + right_motifs[0].len() > amino_acid_seq.len() { break; } - let mut rscore = 0; - for m in 0..right[0].len() { + + // Match the right flank and calculate the score. + let mut right_flank_score = 0; + for right_motif_col in 0..right_motifs[0].len() { let mut hit = false; - for r in 0..right.len() { - if a[k + m] == right[r][m] { + for right_motif_row in &right_motifs { + if amino_acid_seq[right_motif_start_pos + right_motif_col] + == right_motif_row[right_motif_col] + { hit = true; } } if hit { - rscore += 1; + right_flank_score += 1; } } - if rscore >= 4 { - let mut st = false; - for l in j + 1..k + 2 { - if a[l] == b'*' { - st = true; + + // If right flank score is larger than RIGHT_FLANK_MIN_SCORE, continue and attempt to match left flank, + // otherwise continue with next possible CDR3 start position. + if right_flank_score >= RIGHT_FLANK_MIN_SCORE { + // Check if there is a stop codon in the CDR3. + let mut stop_codon = false; + for aa in amino_acid_seq + .iter() + .take(right_motif_start_pos + 2) + .skip(cdr3_start_pos + 1) + { + if aa == &b'*' { + stop_codon = true; } } - let ll = left[0].len(); - if !st && j >= ll { - let mut lscore = 0; - for m in 0..ll { + + // If there is no stop codon and there is room for the full left motif in the AA seq, match left flank. + let ll = left_motifs[0].len(); + if !stop_codon && cdr3_start_pos >= ll { + let mut left_flank_score = 0; + for left_motif_col in 0..ll { let mut hit = false; - for r in 0..left.len() { - if a[j - ll + m] == left[r][m] { + for left_motif_row in &left_motifs { + if amino_acid_seq[cdr3_start_pos - ll + left_motif_col] + == left_motif_row[left_motif_col] + { hit = true; } } if hit { - lscore += 1; + left_flank_score += 1; } } - // ◼ It's possible that the lscore + rscore + + // If the left flank score and total score is above cutoff push the CDR3 to the vec. + // ◼ It's possible that the left_flank_score + right_flank_score // ◼ bound should be increased. - if lscore >= 3 && lscore + rscore >= MIN_TOTAL_CDR3_SCORE { - cdr3.push(( - tig.start + i + 3 * j, - a[j..k + 2 + 1].to_vec(), - lscore, - rscore, - )); + if left_flank_score >= LEFT_FLANK_MIN_SCORE + && left_flank_score + right_flank_score >= MIN_TOTAL_CDR3_SCORE + { + found_cdr3s.push(CDR3Annotation { + start_position_on_contig: contig.start + + frame_idx + + 3 * cdr3_start_pos, + aa_seq: amino_acid_seq + [cdr3_start_pos..right_motif_start_pos + 2 + 1] + .to_vec(), + left_flank_score: left_flank_score, + right_flank_score: right_flank_score, + }); } } } @@ -2729,33 +2772,39 @@ pub fn get_cdr3(tig: &DnaStringSlice, cdr3: &mut Vec<(usize, Vec, usize, usi } // Only return cdr3s having the maximum score. - - let m = cdr3.iter().map(|cdi| cdi.2 + cdi.3).max().unwrap_or(0); - let to_delete = cdr3.iter().map(|cdi| cdi.2 + cdi.3 < m).collect::>(); - erase_if(cdr3, &to_delete); - cdr3.sort(); + let max_score = found_cdr3s + .iter() + .map(|cdr3| cdr3.left_flank_score + cdr3.right_flank_score) + .max() + .unwrap_or(0); + let to_delete = found_cdr3s + .iter() + .map(|cdr3| cdr3.left_flank_score + cdr3.right_flank_score < max_score) + .collect::>(); + erase_if(&mut found_cdr3s, &to_delete); + found_cdr3s.sort_by_key(|cdr3| cdr3.start_position_on_contig); // Prefer later start and prefer longer CDR3. - - if cdr3.len() > 1 { + if found_cdr3s.len() > 1 { // ◼ This is awkward. - let n = cdr3.len(); - cdr3.swap(0, n - 1); - cdr3.truncate(1); - } + let n = found_cdr3s.len(); + found_cdr3s.swap(0, n - 1); + found_cdr3s.truncate(1); + }; + + found_cdr3s } pub fn print_cdr3(tig: &DnaStringSlice, log: &mut Vec) { - let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - get_cdr3(tig, &mut cdr3); - for i in 0..cdr3.len() { + let cdr3_anns = get_cdr3(tig); + for cdr3 in cdr3_anns { fwriteln!( log, "cdr3 = {} at {}, score = {} + {}", - strme(&cdr3[i].1), - cdr3[i].0, - cdr3[i].2, - cdr3[i].3 + strme(&cdr3.aa_seq), + cdr3.start_position_on_contig, + cdr3.left_flank_score, + cdr3.right_flank_score ); } } @@ -2769,7 +2818,7 @@ pub fn print_cdr3(tig: &DnaStringSlice, log: &mut Vec) { // ◼ of the V segment is not right. pub fn cdr3_loc<'a>( - tig: &'a DnaString, + contig: &'a DnaString, refdata: &RefData, ann: &[Annotation], ) -> DnaStringSlice<'a> { @@ -2777,25 +2826,25 @@ pub fn cdr3_loc<'a>( // except possibly for changes less than ten. const LOW_RELV_CDR3: isize = -40; if ann.is_empty() { - return tig.slice(0, 0); + return contig.slice(0, 0); } let mut i = ann.len() - 1; loop { let t = ann[i].ref_id as usize; if !refdata.rheaders[t].contains("segment") && refdata.is_v(t) { let (l, p) = (ann[i].tig_start as isize, ann[i].ref_start as isize); - let vstop_on_tig = l + refdata.refs[t].len() as isize - p; - let mut start = vstop_on_tig + LOW_RELV_CDR3; + let vstop_on_contig = l + refdata.refs[t].len() as isize - p; + let mut start = vstop_on_contig + LOW_RELV_CDR3; if start < 0 { start = 0; } - if start > tig.len() as isize { - start = tig.len() as isize; + if start > contig.len() as isize { + start = contig.len() as isize; } - return tig.slice(start as usize, tig.len()); + return contig.slice(start as usize, contig.len()); } if i == 0 { - return tig.slice(0, 0); + return contig.slice(0, 0); } i -= 1; } @@ -2809,8 +2858,7 @@ pub fn get_cdr3_using_ann( tig: &DnaString, refdata: &RefData, ann: &[Annotation], - cdr3: &mut Vec<(usize, Vec, usize, usize)>, -) { +) -> Vec { let window = cdr3_loc(tig, refdata, ann); // Enlarge the window because get_cdr3 looks for motifs to the left and right @@ -2828,8 +2876,7 @@ pub fn get_cdr3_using_ann( let window2 = tig.slice(start as usize, stop as usize); // Now find the CDR3. - - get_cdr3(&window2, cdr3) + get_cdr3(&window2) } pub fn print_cdr3_using_ann( @@ -2838,16 +2885,15 @@ pub fn print_cdr3_using_ann( ann: &[Annotation], log: &mut Vec, ) { - let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - get_cdr3_using_ann(tig, refdata, ann, &mut cdr3); - for i in 0..cdr3.len() { + let found_cdr3s = get_cdr3_using_ann(tig, refdata, ann); + for cdr3 in found_cdr3s { fwriteln!( log, "cdr3 = {} at {}, score = {} + {}", - strme(&cdr3[i].1), - cdr3[i].0, - cdr3[i].2, - cdr3[i].3 + strme(&cdr3.aa_seq), + cdr3.start_position_on_contig, + cdr3.left_flank_score, + cdr3.right_flank_score ); } } @@ -3150,7 +3196,7 @@ impl ContigAnnotation { let mut stop = -1_i32; let x = b.to_owned().to_ascii_vec(); if vstart >= 0 { - let y = aa_seq(&x, vstart as usize); + let y = nucleotide_to_aminoacid_sequence(&x, vstart as usize); aa = stringme(&y); for i in 0..y.len() { if y[i] == b'*' { @@ -3161,15 +3207,15 @@ impl ContigAnnotation { } let (mut cdr3x, mut cdr3x_dna) = (String::new(), String::new()); let (mut cdr3x_start, mut cdr3x_stop) = (-1_i32, -1_i32); - let mut cdr3y = Vec::<(usize, Vec, usize, usize)>::new(); - if !refdata.refs.is_empty() { - get_cdr3_using_ann(b, refdata, ann, &mut cdr3y); + let found_cdr3s = if !refdata.refs.is_empty() { + get_cdr3_using_ann(b, refdata, ann) } else { - get_cdr3(&b.slice(0, b.len()), &mut cdr3y); - } - if !cdr3y.is_empty() { - cdr3x = stringme(&cdr3y[0].1); - let start = cdr3y[0].0; + get_cdr3(&b.slice(0, b.len())) + }; + if !found_cdr3s.is_empty() { + let cdr3 = found_cdr3s.first().unwrap(); + cdr3x = stringme(&cdr3.aa_seq); + let start = cdr3.start_position_on_contig; for i in start..start + 3 * cdr3x.len() { cdr3x_dna.push(x[i] as char); } diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 2d29d3f57..36e73c231 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -170,13 +170,12 @@ fn evaluate_contig_status( ) }; - let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - get_cdr3_using_ann(contig, reference, ann, &mut cdr3); - contig_status.has_cdr3 = Some(!cdr3.is_empty()); + let found_cdr3s = get_cdr3_using_ann(contig, reference, ann); + contig_status.has_cdr3 = Some(!found_cdr3s.is_empty()); - if let (Some((vstart, jstop)), Some(cdr3)) = (inframe_pair, cdr3.first()) { + if let (Some((vstart, jstop)), Some(cdr3)) = (inframe_pair, found_cdr3s.first()) { let expected_len = (refs[vstart.ref_id].len() + refs[jstop.ref_id].len()) as i32 - + (3 * cdr3.1.len() as i32) + + (3 * cdr3.aa_seq.len() as i32) - 20; let observed_len = jstop.tig_stop as i32 - vstart.tig_start as i32; let delta = expected_len - observed_len;