Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: clarify CDR3 annotation algorithms #399

Merged
merged 5 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions amino/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,14 @@ pub fn codon_to_aa(codon: &[u8]) -> u8 {

// Convert a given DNA sequence to amino acids, starting at a given position.

pub fn aa_seq(x: &[u8], start: usize) -> Vec<u8> {
pub fn nucleotide_to_aminoacid_sequence(dna_seq: &[u8], start: usize) -> Vec<u8> {
let mut a = Vec::<u8>::new();
if x.len() >= 3 {
for j in (start..x.len() - 3 + 1).step_by(3) {
if x[j] == b'-' && x[j + 1] == b'-' && x[j + 2] == b'-' {
if dna_seq.len() >= 3 {
for pos in (start..dna_seq.len() - 3 + 1).step_by(3) {
if dna_seq[pos] == b'-' && dna_seq[pos + 1] == b'-' && dna_seq[pos + 2] == b'-' {
a.push(b'-');
} else {
a.push(codon_to_aa(&x[j..j + 3]));
a.push(codon_to_aa(&dna_seq[pos..pos + 3]));
}
}
}
Expand Down
8 changes: 4 additions & 4 deletions enclone/src/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use enclone_core::barcode_fate::BarcodeFate;
use vdj_ann::refx;

use self::refx::RefData;
use amino::{aa_seq, codon_to_aa};
use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence};
use ansi_escape::emit_end_escape;
use debruijn::{dna_string::DnaString, Mer};
use enclone_core::defs::{CloneInfo, EncloneControl, ExactClonotype};
Expand Down Expand Up @@ -129,8 +129,8 @@ pub fn build_info(

// Optimize to compute entry in aa_mod_indel and the inserted aa sequence.

let aa_full = aa_seq(&x.seq, 0);
let ref_aa = aa_seq(&refdata.refs[vid].to_ascii_vec(), 0);
let aa_full = nucleotide_to_aminoacid_sequence(&x.seq, 0);
let ref_aa = nucleotide_to_aminoacid_sequence(&refdata.refs[vid].to_ascii_vec(), 0);
let ins_len_aa = ins_len / 3;
const EXT: usize = 10;
let ins_pos_low = if ins_pos / 3 < EXT {
Expand Down Expand Up @@ -179,7 +179,7 @@ pub fn build_info(
lens.push(x.seq.len());
tigs.push(x.seq.clone());
tigs_amino.push(x.seq.clone());
aa_mod_indel.push(aa_seq(&x.seq, 0));
aa_mod_indel.push(nucleotide_to_aminoacid_sequence(&x.seq, 0));
tigs_ins.push(Vec::new());
}

Expand Down
4 changes: 2 additions & 2 deletions enclone/src/misc2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

use crate::innate::mark_innate;
use crate::misc3::study_consensus;
use amino::aa_seq;
use amino::nucleotide_to_aminoacid_sequence;
use debruijn::dna_string::DnaString;
use enclone_core::barcode_fate::BarcodeFate;
use enclone_core::defs::{EncloneControl, ExactClonotype, Junction, TigData, TigData0, TigData1};
Expand Down Expand Up @@ -180,7 +180,7 @@ pub fn create_exact_subclonotype_core(
// are the same, which in principle they should be, but this is not actually always true.
// However this is hard to fix.

let aa = aa_seq(tig_bc[r][m].seq(), 0);
let aa = nucleotide_to_aminoacid_sequence(tig_bc[r][m].seq(), 0);
let mut d_start = None;
if tig_bc[r][m].d_start.is_some() {
d_start = Some(tig_bc[r][m].d_start.unwrap() + utr.len() - tig_bc[r][m].v_start);
Expand Down
20 changes: 10 additions & 10 deletions enclone_args/src/read_json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,10 @@ fn process_json_annotation(
} else if !is_productive_contig(&x, refdata, &ann1).0 {
return Ok(res);
}
let mut cdr3 = Vec::<(usize, Vec<u8>, usize, usize)>::new();
get_cdr3_using_ann(&x, refdata, &ann1, &mut cdr3);
cdr3_aa = stringme(&cdr3[0].1);
cdr3_start = cdr3[0].0;
let found_cdr3s = get_cdr3_using_ann(&x, refdata, &ann1);
let cdr3 = found_cdr3s.first().unwrap();
cdr3_aa = stringme(&cdr3.aa_seq);
cdr3_start = cdr3.start_position_on_contig;
cdr3_dna = x
.slice(cdr3_start, cdr3_start + 3 * cdr3_aa.len())
.to_string();
Expand Down Expand Up @@ -385,23 +385,23 @@ fn process_json_annotation(
// than is used in the current version of enclone. This could result in internal
// inconsistencies, leading to an assert somewhere downstream.

let mut cdr3 = Vec::<(usize, Vec<u8>, usize, usize)>::new();
let x = DnaString::from_dna_string(&ann.sequence);
get_cdr3_using_ann(&x, refdata, &annv, &mut cdr3);
if cdr3.is_empty() {
let found_cdr3s = get_cdr3_using_ann(&x, refdata, &annv);
if found_cdr3s.is_empty() {
return Ok(res);
}
let cdr3_aa_alt = stringme(&cdr3[0].1);
let cdr3 = found_cdr3s.first().unwrap();
let cdr3_aa_alt = stringme(&cdr3.aa_seq);
if cdr3_aa != cdr3_aa_alt {
// This is particularly pathological and rare:

if tig_start as usize > cdr3[0].0 {
if tig_start as usize > cdr3.start_position_on_contig {
return Ok(res);
}

// Define start.

cdr3_start = cdr3[0].0 - tig_start as usize;
cdr3_start = cdr3.start_position_on_contig - tig_start as usize;

// Define cdr3.

Expand Down
4 changes: 2 additions & 2 deletions enclone_core/src/mammalian_fixed_len.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Copyright (c) 2021 10X Genomics, Inc. All rights reserved.

use amino::aa_seq;
use amino::nucleotide_to_aminoacid_sequence;
use std::collections::HashMap;
use string_utils::TextUtils;
use superslice::Ext;
Expand Down Expand Up @@ -54,7 +54,7 @@ pub fn mammalian_fixed_len_peer_groups(refdata: &RefData) -> Vec<Vec<(usize, u8,
let mut pg = vec![Vec::<(usize, u8, u32)>::new(); refdata.refs.len()];
for (i, pgi) in pg.iter_mut().enumerate() {
if refdata.is_v(i) {
let aa = aa_seq(&refdata.refs[i].to_ascii_vec(), 0);
let aa = nucleotide_to_aminoacid_sequence(&refdata.refs[i].to_ascii_vec(), 0);
let rtype = refdata.rtype[i];
let chain_type = if rtype == 0 {
"IGH"
Expand Down
16 changes: 13 additions & 3 deletions enclone_print/src/print_utils2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::print_utils1::color_codon;
use crate::proc_cvar_auto::proc_cvar_auto;
use crate::proc_lvar2::proc_lvar2;
use crate::proc_lvar_auto::proc_lvar_auto;
use amino::{aa_seq, codon_to_aa};
use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence};
use enclone_core::allowed_vars::LVARS_ALLOWED;
use enclone_core::barcode_fate::BarcodeFate;
use enclone_core::defs::{AlleleData, ColInfo, EncloneControl, ExactClonotype, GexInfo, POUT_SEP};
Expand Down Expand Up @@ -564,7 +564,12 @@ pub fn row_fill(
// Speak some other column entries.

let xm = &ex.share[mid];
speakc!(u, col, "vj_aa".to_string(), stringme(&aa_seq(&xm.seq, 0)));
speakc!(
u,
col,
"vj_aa".to_string(),
stringme(&nucleotide_to_aminoacid_sequence(&xm.seq, 0))
);
speakc!(u, col, "vj_seq".to_string(), stringme(&xm.seq));
let mut dna = Vec::<u8>::new();
for p in xm.fr1_start..xm.seq_del_amino.len() {
Expand All @@ -578,7 +583,12 @@ pub fn row_fill(
dna.push(xm.seq_del_amino[p]);
}
}
speakc!(u, col, "vj_aa_nl".to_string(), stringme(&aa_seq(&dna, 0)));
speakc!(
u,
col,
"vj_aa_nl".to_string(),
stringme(&nucleotide_to_aminoacid_sequence(&dna, 0))
);
speakc!(u, col, "vj_seq_nl".to_string(), stringme(&dna));
speakc!(u, col, "seq".to_string(), stringme(&xm.full_seq));
let mut vv = Vec::<usize>::new();
Expand Down
28 changes: 14 additions & 14 deletions enclone_print/src/proc_cvar_auto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::print_utils1::{
test_internal_error_seq,
};
use crate::print_utils3::comp_edit;
use amino::{aa_seq, codon_to_aa};
use amino::{codon_to_aa, nucleotide_to_aminoacid_sequence};
use enclone_core::align_to_vdj_ref::{align_to_vdj_ref, cigar};
use enclone_core::defs::{AlleleData, ColInfo, EncloneControl, ExactClonotype, POUT_SEP};
use enclone_core::median::rounded_median;
Expand Down Expand Up @@ -212,7 +212,7 @@ pub fn proc_cvar_auto(
let dna = refdata.refs[x.v_ref_id].to_ascii_vec()
[x.cdr1_start.unwrap()..x.fr2_start.unwrap()]
.to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if x.cdr2_start.is_some()
&& x.fr3_start.is_some()
Expand All @@ -221,7 +221,7 @@ pub fn proc_cvar_auto(
let dna = refdata.refs[x.v_ref_id].to_ascii_vec()
[x.cdr2_start.unwrap()..x.fr3_start.unwrap()]
.to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}

(y, Vec::new(), "clono".to_string())
Expand Down Expand Up @@ -264,9 +264,9 @@ pub fn proc_cvar_auto(
let arg1 = vname.between2("cdr", "_aa").force_i64();
let x = &ex.share[mid];
let y = if arg1 == 1 {
get_cdr1(x, 0, 0).map(|c| stringme(&aa_seq(c.as_bytes(), 0)))
get_cdr1(x, 0, 0).map(|c| stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0)))
} else if arg1 == 2 {
get_cdr2(x, 0, 0).map(|c| stringme(&aa_seq(c.as_bytes(), 0)))
get_cdr2(x, 0, 0).map(|c| stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0)))
} else {
Some(x.cdr3_aa.clone())
}
Expand All @@ -291,7 +291,7 @@ pub fn proc_cvar_auto(
get_cdr3(x, -1, -1)
};
let y = if let Some(c) = c {
stringme(&aa_seq(c.as_bytes(), 0))
stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0))
} else {
"unknown".to_string()
};
Expand Down Expand Up @@ -348,7 +348,7 @@ pub fn proc_cvar_auto(
}
}
test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?;
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if arg1 == 2 {
if x.cdr2_start.is_some()
Expand All @@ -372,7 +372,7 @@ pub fn proc_cvar_auto(
}
}
test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?;
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if x.cdr3_start as i64 - left >= 0
&& x.cdr3_start as i64 - left < x.seq_del_amino.len() as i64
Expand All @@ -395,7 +395,7 @@ pub fn proc_cvar_auto(
}
}
test_internal_error_seq(&x.seq, &dna, &x.cdr3_aa)?;
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}

(y, Vec::new(), "exact".to_string())
Expand Down Expand Up @@ -848,7 +848,7 @@ pub fn proc_cvar_auto(
Some(stringme(dna))
};
let y = if let Some(c) = c {
stringme(&aa_seq(c.as_bytes(), 0))
stringme(&nucleotide_to_aminoacid_sequence(c.as_bytes(), 0))
} else {
"unknown".to_string()
};
Expand All @@ -868,29 +868,29 @@ pub fn proc_cvar_auto(
let dna = refdata.refs[x.v_ref_id].to_ascii_vec()
[x.fr1_start..x.cdr1_start.unwrap()]
.to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if arg1 == 2 {
if x.fr2_start.unwrap() <= x.cdr2_start.unwrap() {
let dna = refdata.refs[x.v_ref_id].to_ascii_vec()
[x.fr2_start.unwrap()..x.cdr2_start.unwrap()]
.to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
} else if arg1 == 3 {
if x.fr3_start.is_some() && x.fr3_start.unwrap() <= x.cdr3_start - x.ins_len() {
let dna = refdata.refs[x.v_ref_id].to_ascii_vec();
if x.cdr3_start <= dna.len() {
let dna = dna[x.fr3_start.unwrap()..x.cdr3_start - x.ins_len()].to_vec();
y = stringme(&aa_seq(&dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(&dna, 0));
}
}
} else {
let heavy = refdata.rtype[x.j_ref_id] == 0;
let aa_len = if heavy { 10 } else { 9 };
let dna = refdata.refs[x.j_ref_id].to_ascii_vec();
let dna = &dna[dna.len() - 1 - 3 * aa_len..dna.len() - 1];
y = stringme(&aa_seq(dna, 0));
y = stringme(&nucleotide_to_aminoacid_sequence(dna, 0));
}

(y, Vec::new(), "clono".to_string())
Expand Down
Loading
Loading