Skip to content

Commit

Permalink
refactor: clarify CDR3 annotation algorithms (#399)
Browse files Browse the repository at this point in the history
  • Loading branch information
10xerik authored Mar 15, 2024
1 parent 337733b commit 3a2f1e6
Show file tree
Hide file tree
Showing 12 changed files with 231 additions and 176 deletions.
10 changes: 5 additions & 5 deletions amino/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,14 @@ pub fn codon_to_aa(codon: &[u8]) -> u8 {

// Convert a given DNA sequence to amino acids, starting at a given position.

pub fn aa_seq(x: &[u8], start: usize) -> Vec<u8> {
pub fn nucleotide_to_aminoacid_sequence(dna_seq: &[u8], start: usize) -> Vec<u8> {
let mut a = Vec::<u8>::new();
if x.len() >= 3 {
for j in (start..x.len() - 3 + 1).step_by(3) {
if x[j] == b'-' && x[j + 1] == b'-' && x[j + 2] == b'-' {
if dna_seq.len() >= 3 {
for pos in (start..dna_seq.len() - 3 + 1).step_by(3) {
if dna_seq[pos] == b'-' && dna_seq[pos + 1] == b'-' && dna_seq[pos + 2] == b'-' {
a.push(b'-');
} else {
a.push(codon_to_aa(&x[j..j + 3]));
a.push(codon_to_aa(&dna_seq[pos..pos + 3]));
}
}
}
Expand Down
8 changes: 4 additions & 4 deletions enclone/src/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use enclone_core::barcode_fate::BarcodeFate;
use vdj_ann::refx;

use self::refx::RefData;
use amino::{aa_seq, codon_to_aa};
use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence};
use ansi_escape::emit_end_escape;
use debruijn::{dna_string::DnaString, Mer};
use enclone_core::defs::{CloneInfo, EncloneControl, ExactClonotype};
Expand Down Expand Up @@ -130,8 +130,8 @@ pub fn build_info(

// Optimize to compute entry in aa_mod_indel and the inserted aa sequence.

let aa_full = aa_seq(&x.seq, 0);
let ref_aa = aa_seq(&refdata.refs[vid].to_ascii_vec(), 0);
let aa_full = nucleotide_to_aminoacid_sequence(&x.seq, 0);
let ref_aa = nucleotide_to_aminoacid_sequence(&refdata.refs[vid].to_ascii_vec(), 0);
let ins_len_aa = ins_len / 3;
const EXT: usize = 10;
let ins_pos_low = if ins_pos / 3 < EXT {
Expand Down Expand Up @@ -180,7 +180,7 @@ pub fn build_info(
lens.push(x.seq.len());
tigs.push(x.seq.clone());
tigs_amino.push(x.seq.clone());
aa_mod_indel.push(aa_seq(&x.seq, 0));
aa_mod_indel.push(nucleotide_to_aminoacid_sequence(&x.seq, 0));
tigs_ins.push(Vec::new());
}

Expand Down
4 changes: 2 additions & 2 deletions enclone/src/misc2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

use crate::innate::mark_innate;
use crate::misc3::study_consensus;
use amino::aa_seq;
use amino::nucleotide_to_aminoacid_sequence;
use debruijn::dna_string::DnaString;
use enclone_core::barcode_fate::BarcodeFate;
use enclone_core::defs::{EncloneControl, ExactClonotype, Junction, TigData, TigData0, TigData1};
Expand Down Expand Up @@ -180,7 +180,7 @@ pub fn create_exact_subclonotype_core(
// are the same, which in principle they should be, but this is not actually always true.
// However this is hard to fix.

let aa = aa_seq(tig_bc[r][m].seq(), 0);
let aa = nucleotide_to_aminoacid_sequence(tig_bc[r][m].seq(), 0);
let mut d_start = None;
if tig_bc[r][m].d_start.is_some() {
d_start = Some(tig_bc[r][m].d_start.unwrap() + utr.len() - tig_bc[r][m].v_start);
Expand Down
20 changes: 10 additions & 10 deletions enclone_args/src/read_json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,10 @@ fn process_json_annotation(
} else if !is_productive_contig(&x, refdata, &ann1).0 {
return Ok(res);
}
let mut cdr3 = Vec::<(usize, Vec<u8>, usize, usize)>::new();
get_cdr3_using_ann(&x, refdata, &ann1, &mut cdr3);
cdr3_aa = stringme(&cdr3[0].1);
cdr3_start = cdr3[0].0;
let found_cdr3s = get_cdr3_using_ann(&x, refdata, &ann1);
let cdr3 = found_cdr3s.first().unwrap();
cdr3_aa = stringme(&cdr3.aa_seq);
cdr3_start = cdr3.start_position_on_contig;
cdr3_dna = x
.slice(cdr3_start, cdr3_start + 3 * cdr3_aa.len())
.to_string();
Expand Down Expand Up @@ -403,23 +403,23 @@ fn process_json_annotation(
// than is used in the current version of enclone. This could result in internal
// inconsistencies, leading to an assert somewhere downstream.

let mut cdr3 = Vec::<(usize, Vec<u8>, usize, usize)>::new();
let x = DnaString::from_dna_string(&ann.sequence);
get_cdr3_using_ann(&x, refdata, &annv, &mut cdr3);
if cdr3.is_empty() {
let found_cdr3s = get_cdr3_using_ann(&x, refdata, &annv);
if found_cdr3s.is_empty() {
return Ok(res);
}
let cdr3_aa_alt = stringme(&cdr3[0].1);
let cdr3 = found_cdr3s.first().unwrap();
let cdr3_aa_alt = stringme(&cdr3.aa_seq);
if cdr3_aa != cdr3_aa_alt {
// This is particularly pathological and rare:

if tig_start as usize > cdr3[0].0 {
if tig_start as usize > cdr3.start_position_on_contig {
return Ok(res);
}

// Define start.

cdr3_start = cdr3[0].0 - tig_start as usize;
cdr3_start = cdr3.start_position_on_contig - tig_start as usize;

// Define cdr3.

Expand Down
4 changes: 2 additions & 2 deletions enclone_core/src/mammalian_fixed_len.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Copyright (c) 2021 10X Genomics, Inc. All rights reserved.

use amino::aa_seq;
use amino::nucleotide_to_aminoacid_sequence;
use std::collections::HashMap;
use string_utils::TextUtils;
use superslice::Ext;
Expand Down Expand Up @@ -54,7 +54,7 @@ pub fn mammalian_fixed_len_peer_groups(refdata: &RefData) -> Vec<Vec<(usize, u8,
let mut pg = vec![Vec::<(usize, u8, u32)>::new(); refdata.refs.len()];
for (i, pgi) in pg.iter_mut().enumerate() {
if refdata.is_v(i) {
let aa = aa_seq(&refdata.refs[i].to_ascii_vec(), 0);
let aa = nucleotide_to_aminoacid_sequence(&refdata.refs[i].to_ascii_vec(), 0);
let rtype = refdata.rtype[i];
let chain_type = if rtype == 0 {
"IGH"
Expand Down
16 changes: 13 additions & 3 deletions enclone_print/src/print_utils2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::print_utils1::color_codon;
use crate::proc_cvar_auto::proc_cvar_auto;
use crate::proc_lvar2::proc_lvar2;
use crate::proc_lvar_auto::proc_lvar_auto;
use amino::{aa_seq, codon_to_aa};
use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence};
use enclone_core::allowed_vars::LVARS_ALLOWED;
use enclone_core::barcode_fate::BarcodeFate;
use enclone_core::defs::{AlleleData, ColInfo, EncloneControl, ExactClonotype, GexInfo, POUT_SEP};
Expand Down Expand Up @@ -564,7 +564,12 @@ pub fn row_fill(
// Speak some other column entries.

let xm = &ex.share[mid];
speakc!(u, col, "vj_aa".to_string(), stringme(&aa_seq(&xm.seq, 0)));
speakc!(
u,
col,
"vj_aa".to_string(),
stringme(&nucleotide_to_aminoacid_sequence(&xm.seq, 0))
);
speakc!(u, col, "vj_seq".to_string(), stringme(&xm.seq));
let mut dna = Vec::<u8>::new();
for p in xm.fr1_start..xm.seq_del_amino.len() {
Expand All @@ -578,7 +583,12 @@ pub fn row_fill(
dna.push(xm.seq_del_amino[p]);
}
}
speakc!(u, col, "vj_aa_nl".to_string(), stringme(&aa_seq(&dna, 0)));
speakc!(
u,
col,
"vj_aa_nl".to_string(),
stringme(&nucleotide_to_aminoacid_sequence(&dna, 0))
);
speakc!(u, col, "vj_seq_nl".to_string(), stringme(&dna));
speakc!(u, col, "seq".to_string(), stringme(&xm.full_seq));
let mut vv = Vec::<usize>::new();
Expand Down
28 changes: 14 additions & 14 deletions enclone_print/src/proc_cvar_auto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::print_utils1::{
test_internal_error_seq,
};
use crate::print_utils3::comp_edit;
use amino::{aa_seq, codon_to_aa};
use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence};
use enclone_core::align_to_vdj_ref::{align_to_vdj_ref, cigar};
use enclone_core::defs::{AlleleData, ColInfo, EncloneControl, ExactClonotype, POUT_SEP};
use enclone_core::median::rounded_median;
Expand Down Expand Up @@ -212,7 +212,7 @@ pub fn proc_cvar_auto(
let dna = refdata.refs[x.v_ref_id].to_ascii_vec()
[x.cdr1_start.unwrap()..x.fr2_start.unwrap()]
.to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if x.cdr2_start.is_some()
&& x.fr3_start.is_some()
Expand All @@ -221,7 +221,7 @@ pub fn proc_cvar_auto(
let dna = refdata.refs[x.v_ref_id].to_ascii_vec()
[x.cdr2_start.unwrap()..x.fr3_start.unwrap()]
.to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}

(y, Vec::new(), "clono".to_string())
Expand Down Expand Up @@ -264,9 +264,9 @@ pub fn proc_cvar_auto(
let arg1 = vname.between2("cdr", "_aa").force_i64();
let x = &ex.share[mid];
let y = if arg1 == 1 {
get_cdr1(x, 0, 0).map(|c| stringme(&aa_seq(c.as_bytes(), 0)))
get_cdr1(x, 0, 0).map(|c| stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0)))
} else if arg1 == 2 {
get_cdr2(x, 0, 0).map(|c| stringme(&aa_seq(c.as_bytes(), 0)))
get_cdr2(x, 0, 0).map(|c| stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0)))
} else {
Some(x.cdr3_aa.clone())
}
Expand All @@ -291,7 +291,7 @@ pub fn proc_cvar_auto(
get_cdr3(x, -1, -1)
};
let y = if let Some(c) = c {
stringme(&aa_seq(c.as_bytes(), 0))
stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0))
} else {
"unknown".to_string()
};
Expand Down Expand Up @@ -348,7 +348,7 @@ pub fn proc_cvar_auto(
}
}
test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?;
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if arg1 == 2 {
if x.cdr2_start.is_some()
Expand All @@ -372,7 +372,7 @@ pub fn proc_cvar_auto(
}
}
test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?;
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if x.cdr3_start as i64 - left >= 0
&& x.cdr3_start as i64 - left < x.seq_del_amino.len() as i64
Expand All @@ -395,7 +395,7 @@ pub fn proc_cvar_auto(
}
}
test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?;
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}

(y, Vec::new(), "exact".to_string())
Expand Down Expand Up @@ -848,7 +848,7 @@ pub fn proc_cvar_auto(
Some(stringme(dna))
};
let y = if let Some(c) = c {
stringme(&aa_seq(c.as_bytes(), 0))
stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0))
} else {
"unknown".to_string()
};
Expand All @@ -868,29 +868,29 @@ pub fn proc_cvar_auto(
let dna = refdata.refs[x.v_ref_id].to_ascii_vec()
[x.fr1_start..x.cdr1_start.unwrap()]
.to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if arg1 == 2 {
if x.fr2_start.unwrap() <= x.cdr2_start.unwrap() {
let dna = refdata.refs[x.v_ref_id].to_ascii_vec()
[x.fr2_start.unwrap()..x.cdr2_start.unwrap()]
.to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if arg1 == 3 {
if x.fr3_start.is_some() && x.fr3_start.unwrap() <= x.cdr3_start - x.ins_len() {
let dna = refdata.refs[x.v_ref_id].to_ascii_vec();
if x.cdr3_start <= dna.len() {
let dna = dna[x.fr3_start.unwrap()..x.cdr3_start - x.ins_len()].to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
}
} else {
let heavy = refdata.rtype[x.j_ref_id] == 0;
let aa_len = if heavy { 10 } else { 9 };
let dna = refdata.refs[x.j_ref_id].to_ascii_vec();
let dna = &dna[dna.len() - 1 - 3 * aa_len..dna.len() - 1];
y = stringme(&aa_seq(dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(dna, 0));
}

(y, Vec::new(), "clono".to_string())
Expand Down
Loading

0 comments on commit 3a2f1e6

Please sign in to comment.