Skip to content

Commit

Permalink
tools: Normalize sv for non-symbolic variants. #TASK-6558
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Jul 18, 2024
1 parent a6abc51 commit 58bee08
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -301,19 +301,16 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
normalizedVariants.add(variant);
continue;
}
String reference = variant.getReference(); //Save original values, as they can be changed
//Save original values, as they can be changed
String reference = variant.getReference();
String alternate = variant.getAlternate();
Integer start = variant.getStart();
Integer end = variant.getEnd();
String chromosome = variant.getChromosome();

if (variant.getStudies() == null || variant.getStudies().isEmpty()) {
List<VariantKeyFields> keyFieldsList;
if (isSymbolic(variant)) {
keyFieldsList = normalizeSymbolic(start, end, reference, alternate, variant.getSv());
} else {
keyFieldsList = normalize(chromosome, start, reference, alternate);
}
List<VariantKeyFields> keyFieldsList = normalizeAlleles(variant);

// Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order!
for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) {
OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele());
Expand All @@ -331,25 +328,16 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
normalizedVariants.add(normalizedVariant);
}
} else {
for (StudyEntry entry : variant.getStudies()) {
List<String> originalAlternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size());
List<String> alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size());
alternates.add(alternate);
originalAlternates.add(alternate);
for (String secondaryAlternatesAllele : entry.getSecondaryAlternatesAlleles()) {
alternates.add(secondaryAlternatesAllele);
originalAlternates.add(secondaryAlternatesAllele);
}
if (variant.getStudies().size() != 1) {
throw new IllegalStateException("Only one study per variant is supported when normalizing variants. Found "
+ variant.getStudies().size() + " studies. Variant: " + variant);
} else {
StudyEntry entry = variant.getStudies().get(0);
List<String> alternates = getAllAlternates(variant);

// FIXME: assumes there won't be multinucleotide positions with CNVs and short variants mixed
List<VariantKeyFields> keyFieldsList;
List<VariantKeyFields> originalKeyFieldsList;
if (isSymbolic(variant)) {
keyFieldsList = normalizeSymbolic(start, end, reference, alternates, variant.getSv());
} else {
keyFieldsList = normalize(chromosome, start, reference, alternates);
}
originalKeyFieldsList = keyFieldsList
List<VariantKeyFields> keyFieldsList = normalizeAlleles(variant);
List<VariantKeyFields> originalKeyFieldsList = keyFieldsList
.stream()
.filter(k -> !k.isReferenceBlock())
.map(k -> k.originalKeyFields)
Expand All @@ -372,8 +360,8 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
originalCall = entry.getFiles().get(0).getCall().getVariantId();
} else {
StringBuilder sb = new StringBuilder(variant.toString());
for (int i = 1; i < originalAlternates.size(); i++) {
sb.append(",").append(originalAlternates.get(i));
for (int i = 1; i < alternates.size(); i++) {
sb.append(",").append(alternates.get(i));
}
originalCall = sb.toString();
}
Expand Down Expand Up @@ -600,17 +588,54 @@ private Collection<VariantKeyFields> sortByPosition(List<VariantKeyFields> keyFi
// }
// }

/**
 * Normalizes every allele of the given variant.
 * <p>
 * The alternates (main plus secondary ones, as returned by {@code getAllAlternates}) are collected first.
 * The work is then dispatched to {@code normalizeSymbolic} when {@code isSymbolic(variant)} holds,
 * or to the sequence-based {@code normalize} otherwise. In both cases the variant's
 * {@link StructuralVariation} is forwarded to the callee.
 *
 * @param variant variant to normalize
 * @return the key fields produced by the selected normalization routine
 */
protected List<VariantKeyFields> normalizeAlleles(Variant variant) {
    final List<String> allAlternates = getAllAlternates(variant);

    if (isSymbolic(variant)) {
        return normalizeSymbolic(
                variant.getStart(), variant.getEnd(), variant.getReference(), allAlternates, variant.getSv());
    }
    return normalize(
            variant.getChromosome(), variant.getStart(), variant.getReference(), allAlternates, variant.getSv());
}

/**
 * Collects the main alternate of a variant followed by the secondary alternates of its first study.
 * <p>
 * When the variant has no studies, the result contains only the main alternate.
 * Only the first study is inspected.
 *
 * @param variant variant whose alternates are collected
 * @return unmodifiable list with the main alternate first, then the secondary alternates in their original order
 * @throws IllegalStateException if a secondary alternate declares a start or an end that differs from the variant's
 */
private static List<String> getAllAlternates(Variant variant) {
    final List<StudyEntry> studies = variant.getStudies();
    if (studies == null || studies.isEmpty()) {
        return Collections.unmodifiableList(Collections.singletonList(variant.getAlternate()));
    }

    final StudyEntry firstStudy = studies.get(0);
    final String mainAlternate = variant.getAlternate();
    final List<String> collected = new ArrayList<>(1 + firstStudy.getSecondaryAlternates().size());
    collected.add(mainAlternate);

    for (AlternateCoordinate secondary : firstStudy.getSecondaryAlternates()) {
        // A secondary alternate without explicit coordinates is accepted as-is; only explicit values are compared.
        final boolean startMismatch = secondary.getStart() != null
                && !secondary.getStart().equals(variant.getStart());
        if (startMismatch) {
            throw new IllegalStateException("Unable to normalize variant where secondary alternates do not start at the same position. "
                    + "Variant: " + variant + " , secondaryAlternate: " + secondary);
        }
        final boolean endMismatch = secondary.getEnd() != null
                && !secondary.getEnd().equals(variant.getEnd());
        if (endMismatch) {
            throw new IllegalStateException("Unable to normalize variant where secondary alternates do not end at the same position. "
                    + "Variant: " + variant + " (end=" + variant.getEnd() + ") , secondaryAlternate: " + secondary);
        }
        collected.add(secondary.getAlternate());
    }
    return Collections.unmodifiableList(collected);
}

/**
 * Single-alternate convenience overload of {@code normalizeSymbolic}.
 * <p>
 * Wraps {@code alternate} in a singleton list and delegates to the list-based overload,
 * forwarding every other argument unchanged.
 *
 * @param start     start position, forwarded as-is
 * @param end       end position, forwarded as-is
 * @param reference reference allele, forwarded as-is
 * @param alternate the only alternate allele
 * @param sv        structural variation details, forwarded as-is
 * @return whatever the list-based overload returns
 * @deprecated Test purposes only.
 */
@Deprecated // Test purposes only
public List<VariantKeyFields> normalizeSymbolic(Integer start, Integer end, String reference, String alternate, StructuralVariation sv) {
// Delegate to the multi-alternate overload with a one-element list.
return normalizeSymbolic(start, end, reference, Collections.singletonList(alternate), sv);
}

@Deprecated
@Deprecated // Test purposes only
public List<VariantKeyFields> normalizeSymbolic(final Integer start, final Integer end, final String reference,
final List<String> alternates) {
return normalizeSymbolic(start, end, reference, alternates, null);
}

public List<VariantKeyFields> normalizeSymbolic(final Integer start, final Integer end, final String reference,
protected List<VariantKeyFields> normalizeSymbolic(final Integer start, final Integer end, final String reference,
final List<String> alternates, StructuralVariation sv) {
List<VariantKeyFields> list = new ArrayList<>(alternates.size());

Expand All @@ -634,44 +659,48 @@ public List<VariantKeyFields> normalizeSymbolic(final Integer start, final Integ
keyFields.getSv().setType(StructuralVariantType.TANDEM_DUPLICATION);
}

if (sv != null) {
StructuralVariation normalizedSv = keyFields.getSv();
if (normalizedSv == null) {
normalizedSv = new StructuralVariation();
}
// CI positions may change during the normalization. Update them.
normalizedSv.setCiStartLeft(sv.getCiStartLeft());
normalizedSv.setCiStartRight(sv.getCiStartRight());

// Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND.
// At this point, we're removing the CIEND from the normalized variant.
// Do not remove the value from the INFO field (if any).
// The END is the same as the start (which, in base-1 means that "end == start -1" , so "end < start")
if (keyFields.getEnd() < keyFields.getStart()) {
normalizedSv.setCiEndLeft(null);
normalizedSv.setCiEndRight(null);
} else {
normalizedSv.setCiEndLeft(sv.getCiEndLeft());
normalizedSv.setCiEndRight(sv.getCiEndRight());
}
normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq());
normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq());

if (keyFields.getSv() == null) {
if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null
|| normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null
|| normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) {
keyFields.setSv(normalizedSv);
}
}
}
normalizeSvField(sv, keyFields);

list.add(keyFields);
}

return list;
}

/**
 * Transfers the confidence intervals and inserted sequences of the original structural variation
 * onto the given normalized key fields.
 * <p>
 * If the key fields already carry a {@link StructuralVariation}, that instance is updated in place.
 * Otherwise a new instance is attached only when at least one of the copied values is non-null.
 * <p>
 * This method does nothing when either argument is {@code null}. The {@code keyFields} guard is
 * required because callers may invoke this method before they have checked {@code keyFields} for null.
 *
 * @param sv        structural variation of the original variant; may be {@code null}
 * @param keyFields normalized allele to update; may be {@code null}
 */
private static void normalizeSvField(StructuralVariation sv, VariantKeyFields keyFields) {
    if (sv == null || keyFields == null) {
        // Nothing to copy from, or nothing to copy into.
        return;
    }

    final StructuralVariation existingSv = keyFields.getSv();
    final StructuralVariation normalizedSv = existingSv == null ? new StructuralVariation() : existingSv;

    // CI positions may change during the normalization. Update them.
    normalizedSv.setCiStartLeft(sv.getCiStartLeft());
    normalizedSv.setCiStartRight(sv.getCiStartRight());

    // Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND.
    // At this point, we're removing the CIEND from the normalized variant.
    // Do not remove the value from the INFO field (if any).
    // The END is the same as the start (which, in base-1 means that "end == start -1" , so "end < start")
    if (keyFields.getEnd() < keyFields.getStart()) {
        normalizedSv.setCiEndLeft(null);
        normalizedSv.setCiEndRight(null);
    } else {
        normalizedSv.setCiEndLeft(sv.getCiEndLeft());
        normalizedSv.setCiEndRight(sv.getCiEndRight());
    }
    normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq());
    normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq());

    // When the key fields had no SV of their own, attach the new one only if it carries any information.
    if (existingSv == null) {
        final boolean hasAnyValue = normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null
                || normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null
                || normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null;
        if (hasAnyValue) {
            keyFields.setSv(normalizedSv);
        }
    }
}

private boolean isNonRef(String alternate) {
return alternate.equals(Allele.NO_CALL_STRING)
|| alternate.equals(VariantBuilder.NON_REF_ALT)
Expand Down Expand Up @@ -780,12 +809,17 @@ private VariantKeyFields normalizeSymbolic(
}


@Deprecated // Test purposes only
public List<VariantKeyFields> normalize(String chromosome, int position, String reference, String alternate) {
return normalize(chromosome, position, reference, Collections.singletonList(alternate));
return normalize(chromosome, position, reference, Collections.singletonList(alternate), null);
}

public List<VariantKeyFields> normalize(String chromosome, int position, String reference, List<String> alternates)
{
/**
 * Overload of {@code normalize} that takes no structural variation details.
 * <p>
 * Delegates to the five-argument overload, passing {@code null} as the structural variation.
 *
 * @param chromosome chromosome, forwarded as-is
 * @param position   position, forwarded as-is
 * @param reference  reference allele, forwarded as-is
 * @param alternates alternate alleles, forwarded as-is
 * @return whatever the five-argument overload returns
 * @deprecated Test purposes only.
 */
@Deprecated // Test purposes only
public List<VariantKeyFields> normalize(String chromosome, int position, String reference, List<String> alternates) {
// No structural variation information is available through this entry point.
return normalize(chromosome, position, reference, alternates, null);
}

protected List<VariantKeyFields> normalize(String chromosome, int position, String reference, List<String> alternates, StructuralVariation sv) {

List<VariantKeyFields> list = new ArrayList<>(alternates.size());
int numAllelesIdx = 0; // This index is necessary for getting the samples where the mutated allele is present
Expand Down Expand Up @@ -829,6 +863,8 @@ public List<VariantKeyFields> normalize(String chromosome, int position, String
}
}

normalizeSvField(sv, keyFields);

if (keyFields != null) {

// To deal with cases such as A>GT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,32 @@ public void testINSsNormalizationWithCIEND() throws Exception {
});
}

/**
 * A non-symbolic (sequence-resolved) insertion carrying CIPOS and CIEND file attributes.
 * The normalized variant is expected to keep the start confidence interval and to have no end
 * confidence interval.
 */
@Test
public void testNormalizeNonSymbolicInsertion() throws Exception {
    String insertedAllele = "CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
    Variant variant = newVariantBuilder(100, null, "C", Collections.singletonList(insertedAllele), "2")
            .addFileData("CIPOS", "-14,50")
            .addFileData("CIEND", "-50,11")
            .addSample("HG00096", "0|0")
            .build();

    StructuralVariation expectedSv = new StructuralVariation(86, 150, null, null, null, null, null, null, null);
    normalizeOne(variant, normalized -> assertEquals(expectedSv, normalized.getSv()));
}

/**
 * A non-symbolic (sequence-resolved) deletion carrying CIPOS and CIEND file attributes.
 * The normalized variant is expected to expose both the start and the end confidence intervals.
 */
@Test
public void testNormalizeNonSymbolicDeletion() throws Exception {
    String deletedAllele = "CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
    Variant variant = newVariantBuilder(100, null, deletedAllele, "C", "2")
            .addFileData("CIPOS", "-14,50")
            .addFileData("CIEND", "-1,1")
            .addSample("HG00096", "0|1")
            .build();

    StructuralVariation expectedSv = new StructuralVariation(86, 150, 179, 181, null, null, null, null, null);
    normalizeOne(variant, normalized -> assertEquals(expectedSv, normalized.getSv()));
}

@Test
public void testDUPTANDEMNormalization() throws Exception {
Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList("<DUP:TANDEM>"), "2")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,8 @@ public void testMergeIndelCase1() throws NonStandardCompliantSampleField {
Variant v1 = VariantTestUtils.generateVariantWithFormat("1:328:CTT:C",
VCFConstants.GENOTYPE_KEY + "," + VCFConstants.GENOTYPE_FILTER_KEY,
"S1", "1/2","PASS");
v1.getStudies().get(0).getSecondaryAlternates().add(new AlternateCoordinate(null,null,331,"CTT", "CTTTC", VariantType.INDEL));

v1.getStudies().get(0).getSecondaryAlternates().add(new AlternateCoordinate(null, null, 330, "CTT", "CTTTC", VariantType.INDEL));

Variant v2 = VariantTestUtils.generateVariantWithFormat("1:331:T:TCT",
VCFConstants.GENOTYPE_KEY + "," + VCFConstants.GENOTYPE_FILTER_KEY,
Expand Down

0 comments on commit 58bee08

Please sign in to comment.