diff --git a/enclone_args/src/read_json.rs b/enclone_args/src/read_json.rs index c08ebec52..f627260ae 100644 --- a/enclone_args/src/read_json.rs +++ b/enclone_args/src/read_json.rs @@ -2,7 +2,7 @@ use self::annotate::{annotate_seq, get_cdr3_using_ann, print_some_annotations}; use self::refx::RefData; -use self::transcript::is_valid; +use self::transcript::is_productive_contig; use debruijn::dna_string::DnaString; use enclone_core::barcode_fate::BarcodeFate; use enclone_core::defs::{EncloneControl, OriginInfo, TigData}; @@ -183,28 +183,12 @@ fn process_json_annotation( print_some_annotations(refdata, &ann1, &mut log, false); print!("\n{}", strme(&log)); } - let mut log = Vec::::new(); if ctl.gen_opt.trace_barcode == ann.barcode { - if !is_valid( - &x, - refdata, - &ann1, - true, - &mut log, - Some(ctl.gen_opt.gamma_delta), - ) { - print!("{}", strme(&log)); + if !is_productive_contig(&x, refdata, &ann1).0 { println!("invalid"); return Ok(res); } - } else if !is_valid( - &x, - refdata, - &ann1, - false, - &mut log, - Some(ctl.gen_opt.gamma_delta), - ) { + } else if !is_productive_contig(&x, refdata, &ann1).0 { return Ok(res); } let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); diff --git a/vdj_ann/src/annotate.rs b/vdj_ann/src/annotate.rs index adc59d263..bbfab86df 100644 --- a/vdj_ann/src/annotate.rs +++ b/vdj_ann/src/annotate.rs @@ -3,8 +3,8 @@ // This file contains code to annotate a contig, in the sense of finding alignments // to VDJ reference contigs. Also to find CDR3 sequences. And some related things. -use crate::refx::RefData; -use crate::transcript::is_valid; +use crate::transcript::is_productive_contig; +use crate::{refx::RefData, transcript::ContigStatus}; use align_tools::affine_align; use amino::{aa_seq, have_start}; use bio_edit::alignment::AlignmentOperation::{Del, Ins, Match, Subst, Xclip, Yclip}; @@ -3022,6 +3022,8 @@ pub struct ContigAnnotation { pub productive: Option, // productive? (null means not full length) #[serde(default = "set_true")] pub filtered: bool, // true and never changed (unused field) + /// criteria used to asess productive status + pub productive_criteria: Option, pub is_gex_cell: Option, // Was the barcode declared a cell by Gene expression data, if available pub is_asm_cell: Option, // Was the barcode declared a cell by the VDJ assembler @@ -3057,6 +3059,7 @@ impl ContigAnnotation { invalidated_umis: Option>, // invalidated UMIs is_cellx: bool, // was the barcode declared a cell? productivex: bool, // productive? + productive_criteria: ContigStatus, // criteria used to asess productive status jsupp: Option, // num reads, umis supporting junction ) -> ContigAnnotation { let mut vstart = -1_i32; @@ -3146,6 +3149,7 @@ impl ContigAnnotation { invalidated_umis, is_cell: is_cellx, productive: Some(productivex), + productive_criteria: Some(productive_criteria), filtered: true, junction_support: jsupp, // These need to be populated by the assembler explicitly as needed @@ -3180,13 +3184,11 @@ impl ContigAnnotation { non_validated_umis: Option>, // non-validated UMIs invalidated_umis: Option>, // invalidated UMIs is_cell: bool, // was the barcode declared a cell? - is_gd: Option, // is gamma/delta mode jsupp: Option, // num reads, umis supporting junction ) -> ContigAnnotation { let mut ann = Vec::<(i32, i32, i32, i32, i32)>::new(); annotate_seq(b, refdata, &mut ann, true, false, true); - let mut log = Vec::::new(); - let productive = is_valid(b, refdata, &ann, false, &mut log, is_gd); + let (is_productive, productive_criteria) = is_productive_contig(b, refdata, &ann); ContigAnnotation::from_annotate_seq( b, q, @@ -3200,7 +3202,8 @@ impl ContigAnnotation { non_validated_umis, invalidated_umis, is_cell, - productive, + is_productive, + productive_criteria, jsupp, ) } @@ -3358,7 +3361,6 @@ mod tests { None, false, // is_cell, should be changed to None None, - None, ); // println!("{:#?}", annotation); diff --git a/vdj_ann/src/transcript.rs b/vdj_ann/src/transcript.rs index 2fa9f708b..b8a0eea6b 100644 --- a/vdj_ann/src/transcript.rs +++ b/vdj_ann/src/transcript.rs @@ -7,187 +7,243 @@ use crate::refx::RefData; use amino::{have_start, have_stop}; use debruijn::{dna_string::DnaString, kmer::Kmer20, Mer, Vmer}; use hyperbase::Hyper; -use io_utils::fwriteln; +use itertools::iproduct; use kmer_lookup::make_kmer_lookup_20_single; -use std::{cmp::max, io::prelude::*}; +use serde::{Deserialize, Serialize}; +use std::cmp::max; +use std::str::FromStr; +use vdj_types::{VdjChain, VDJ_CHAINS}; use vector_utils::{lower_bound1_3, unique_sort}; +const MIN_DELTA: i32 = -25; +const MIN_DELTA_IGH: i32 = -55; +const MAX_DELTA: i32 = 35; + // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ // TEST FOR VALID VDJ SEQUENCE // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ -pub fn is_valid( - b: &DnaString, - refdata: &RefData, - ann: &[(i32, i32, i32, i32, i32)], - logme: bool, - log: &mut Vec, - is_gd: Option, -) -> bool { - // Unwrap gamma/delta mode flag - let gd_mode = is_gd.unwrap_or(false); - let refs = &refdata.refs; - let rheaders = &refdata.rheaders; - for pass in 0..2 { - let mut m = "A"; - if pass == 1 { - m = "B"; - } - let mut vstarts = Vec::::new(); - let mut jstops = Vec::::new(); - let mut first_vstart: i32 = -1; - let mut first_vstart_len: i32 = -1; - let mut last_jstop: i32 = -1; - let mut last_jstop_len: i32 = -1; - let mut igh = false; - for j in 0..ann.len() { - let l = ann[j].0 as usize; - let len = ann[j].1 as usize; - let t = ann[j].2 as usize; - let p = ann[j].3 as usize; - if rheaders[t].contains("IGH") { - igh = true; - } - if !rheaders[t].contains("5'UTR") - && ((m == "A" - && (rheaders[t].contains("TRAV") - || (rheaders[t].contains("TRGV") && gd_mode) - || rheaders[t].contains("IGHV"))) - || (m == "B" - && (rheaders[t].contains("TRBV") - || (rheaders[t].contains("TRDV") && gd_mode) - || rheaders[t].contains("IGLV") - || rheaders[t].contains("IGKV")))) - { - if first_vstart < 0 { - first_vstart = l as i32; - first_vstart_len = (refs[t].len() - p) as i32; - } - if p == 0 { - vstarts.push(l as i32); - } - } - if (m == "A" - && (rheaders[t].contains("TRAJ") - || (rheaders[t].contains("TRGJ") && gd_mode) - || rheaders[t].contains("IGHJ"))) - || (m == "B" - && (rheaders[t].contains("TRBJ") - || (rheaders[t].contains("TRDJ") && gd_mode) - || rheaders[t].contains("IGLJ") - || rheaders[t].contains("IGKJ"))) - { - last_jstop = (l + len) as i32; - last_jstop_len = (p + len) as i32; - if p + len == refs[t].len() { - jstops.push((l + len) as i32); - } - } - } - unique_sort(&mut vstarts); - unique_sort(&mut jstops); - let mut full = false; - for pass in 1..3 { - if pass == 2 && full { - continue; - } - let mut msg = ""; - if pass == 2 { - msg = "frameshifted "; - }; - for start in vstarts.iter() { - if !have_start(b, *start as usize) { - continue; - } - for stop in jstops.iter() { - let n = stop - start; - if pass == 2 || n % 3 == 1 { - let mut stop_codon = false; - // shouldn't it be stop-3+1???????????????????????????????? - for j in (*start..stop - 3).step_by(3) { - if have_stop(b, j as usize) { - stop_codon = true; - } - } - if !stop_codon { - if pass == 1 { - full = true; - } - if logme { - fwriteln!( - log, - "{}full length transcript of length {}", - msg, - b.len() - ); - } - } else if logme { - fwriteln!( - log, - "{}full length stopped transcript of length {}", - msg, - b.len() - ); - } - } - } - } - } - let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); - get_cdr3_using_ann(b, refdata, ann, &mut cdr3); - if cdr3.is_empty() { - if logme { - fwriteln!(log, "did not find CDR3"); - } - return false; - } - let mut too_large = false; - const MIN_DELTA: i32 = -25; - const MIN_DELTA_IGH: i32 = -55; - const MAX_DELTA: i32 = 35; - if first_vstart >= 0 && last_jstop >= 0 { - let delta = (last_jstop_len + first_vstart_len + 3 * cdr3[0].1.len() as i32 - 20) - - (last_jstop - first_vstart); - if logme { - fwriteln!(log, "VJ delta = {}", delta); - } - let mut min_delta = MIN_DELTA; - if igh { - min_delta = MIN_DELTA_IGH; - } - if delta < min_delta || delta > MAX_DELTA { - too_large = true; - if logme { - fwriteln!(log, "delta too large"); - } - } - } - let mut misordered = false; - for j1 in 0..ann.len() { - let t1 = ann[j1].2 as usize; - for j2 in j1 + 1..ann.len() { - let t2 = ann[j2].2 as usize; - if (refdata.is_j(t1) && refdata.is_v(t2)) - || (refdata.is_j(t1) && refdata.is_u(t2)) - || (refdata.is_j(t1) && refdata.is_d(t2)) - || (refdata.is_v(t1) && refdata.is_u(t2)) - || (refdata.is_c(t1) && !refdata.is_c(t2)) - { - misordered = true; - } - } - } - if misordered && logme { - fwriteln!(log, "misordered"); +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Default)] +pub struct ContigStatus { + full_length: Option, + has_v_start: Option, + in_frame: Option, + no_premature_stop: Option, + has_cdr3: Option, + has_expected_size: Option, + correct_ann_order: Option, +} + +impl ContigStatus { + fn is_productive(&self) -> bool { + match ( + self.full_length, + self.has_v_start, + self.in_frame, + self.no_premature_stop, + self.has_cdr3, + self.has_expected_size, + self.correct_ann_order, + ) { + ( + Some(true), + Some(true), + Some(true), + Some(true), + Some(true), + Some(true), + Some(true), + ) => true, + (_, _, _, _, _, _, _) => false, } - if !full && logme { - fwriteln!(log, "not full"); + } + + fn order_by(&self) -> u16 { + match ( + self.full_length, + self.has_v_start, + self.in_frame, + self.no_premature_stop, + self.has_cdr3, + self.has_expected_size, + self.correct_ann_order, + ) { + ( + Some(true), + Some(true), + Some(true), + Some(true), + Some(true), + Some(true), + Some(true), + ) => 0, + (Some(_), Some(_), Some(_), Some(_), Some(_), Some(_), Some(_)) => 1, + (Some(_), Some(_), Some(_), Some(_), Some(_), Some(_), _) => 2, + (Some(_), Some(_), Some(_), Some(_), Some(_), _, _) => 3, + (Some(_), Some(_), Some(_), Some(_), _, _, _) => 4, + (Some(_), Some(_), Some(_), _, _, _, _) => 5, + (Some(_), Some(_), _, _, _, _, _) => 6, + (Some(_), _, _, _, _, _, _) => 7, + _ => 8, } - if full && !too_large && !misordered { - return true; + } +} + +#[derive(Clone, Copy)] +pub struct Vstart { + ref_id: usize, + tig_start: usize, +} + +#[derive(Clone, Copy)] +pub struct Jstop { + ref_id: usize, + tig_stop: usize, +} + +#[derive(Debug)] +struct Annotation { + tig_start: usize, + match_length: usize, + ref_id: usize, + ref_start: usize, +} + +fn find_inframe_vdj_pair(vstarts: Vec, jstops: Vec) -> Option<(Vstart, Jstop)> { + let mut vj_combinations: Vec<(Vstart, Jstop, i32)> = iproduct!(vstarts, jstops) + .map(|(v, j)| (v, j, j.tig_stop as i32 - v.tig_start as i32)) + .filter(|(_, _, n)| n > &0) + .filter(|(_, _, n)| n % 3 == 1) + .collect(); + vj_combinations.sort_by_key(|x| x.2); + let inframe_pair = vj_combinations.last().map(|(v, j, _)| (*v, *j)); + inframe_pair +} + +fn evaluate_contig_status( + vdj_chain: VdjChain, + ann: &[(i32, i32, i32, i32, i32)], + reference: &RefData, + contig: &DnaString, +) -> Option { + let valid_v_type = format!("{vdj_chain}V"); + let valid_j_type = format!("{vdj_chain}J"); + let rheaders = &reference.rheaders; + let refs = &reference.refs; + let annotation: Vec = ann + .iter() + .map( + |(tig_start, match_length, ref_id, ref_start, _)| Annotation { + tig_start: *tig_start as usize, + match_length: *match_length as usize, + ref_id: *ref_id as usize, + ref_start: *ref_start as usize, + }, + ) + .collect(); + let mut vstarts: Vec = annotation + .iter() + .filter(|a| reference.is_v(a.ref_id)) + .filter(|a| rheaders[a.ref_id].contains(&valid_v_type)) + .filter(|a| a.ref_start == 0) + .map(|a| Vstart { + ref_id: a.ref_id, + tig_start: a.tig_start, + }) + .collect(); + let jstops: Vec = annotation + .iter() + .filter(|a| reference.is_j(a.ref_id)) + .filter(|a| rheaders[a.ref_id].contains(&valid_j_type)) + .filter(|a| a.ref_start + a.match_length == refs[a.ref_id].len()) + .map(|a| Jstop { + ref_id: a.ref_id, + tig_stop: a.tig_start + a.match_length, + }) + .collect(); + + if vstarts.is_empty() & jstops.is_empty() { + return None; + } + + let mut contig_status = ContigStatus { + full_length: Some(!vstarts.is_empty() & !jstops.is_empty()), + ..Default::default() + }; + + // filter vstarts to require START codon + vstarts.retain(|v| have_start(contig, v.tig_start)); + contig_status.has_v_start = match (contig_status.full_length, vstarts.is_empty()) { + (Some(true), false) => Some(true), + (_, _) => Some(false), + }; + + let inframe_pair = find_inframe_vdj_pair(vstarts, jstops); + contig_status.in_frame = Some(inframe_pair.is_some()); + + if let Some((vstart, jstop)) = inframe_pair { + contig_status.no_premature_stop = Some( + !(vstart.tig_start..jstop.tig_stop - 3) + .step_by(3) + .any(|j| have_stop(contig, j)), + ) + }; + + let mut cdr3 = Vec::<(usize, Vec, usize, usize)>::new(); + get_cdr3_using_ann(contig, reference, ann, &mut cdr3); + contig_status.has_cdr3 = Some(!cdr3.is_empty()); + + if let (Some((vstart, jstop)), Some(cdr3)) = (inframe_pair, cdr3.first()) { + let expected_len = (refs[vstart.ref_id].len() + refs[jstop.ref_id].len()) as i32 + + (3 * cdr3.1.len() as i32) + - 20; + let observed_len = jstop.tig_stop as i32 - vstart.tig_start as i32; + let delta = expected_len - observed_len; + let min_delta = if vdj_chain == VdjChain::IGH { + MIN_DELTA_IGH + } else { + MIN_DELTA + }; + contig_status.has_expected_size = if delta < min_delta || delta > MAX_DELTA { + Some(false) + } else { + Some(true) } + }; + + let observed_order: Vec = annotation + .into_iter() + .map(|a| reference.segtype[a.ref_id]) + .map(|s| match s { + "U" => 0, + "V" => 1, + "D" => 2, + "J" => 3, + "C" => 4, + _ => panic!("Invalid segtype"), + }) + .collect(); + let mut expected_order = observed_order.clone(); + expected_order.sort(); + contig_status.correct_ann_order = Some(observed_order == expected_order); + + Some(contig_status) +} + +pub fn is_productive_contig( + b: &DnaString, + refdata: &RefData, + ann: &[(i32, i32, i32, i32, i32)], +) -> (bool, ContigStatus) { + let contig_status = VDJ_CHAINS + .iter() + .map(|chain| VdjChain::from_str(chain).unwrap()) + .filter_map(|chain| evaluate_contig_status(chain, ann, refdata, b)) + .min_by_key(ContigStatus::order_by); + if let Some(cs) = contig_status { + return (cs.is_productive(), cs.clone()); } - false + (false, ContigStatus::default()) } // ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓