Skip to content

Commit

Permalink
[TEST] Add some test cases
Browse files Browse the repository at this point in the history
Signed-off-by: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
  • Loading branch information
Irallia committed Aug 19, 2021
1 parent e37ebc5 commit 5cd7ed5
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 1 deletion.
149 changes: 149 additions & 0 deletions test/api/detection_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,155 @@ TEST(junction_detection, cigar_string_del_padding)
}
}

TEST(junction_detection, cigar_string_dup_detect_tandem_duplication)
{
seqan3::dna5_vector const & query_sequence = {"AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC"_dna5};
int32_t length = 15;
int32_t pos_start_dup_seq = 0;
int32_t pos_end_dup_seq = 0;
size_t tandem_dup_count = 0;

// Prefix Case: The duplication (insertion) comes after the matched sequence.
// (10, 20] -> pos_start_dup_seq, pos_end_dup_seq
// ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG
// ||||||||||||||||||| ||||||||
// read AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC -> inserted sequence: GCGGGGCGGGGCGGG
//
// suffix_sequence AAAACCGCGTAGCGGGGCGGG
// ||||||||||
// inserted sequence: GCGGGGCGGGGCGGG
// -> tandem_dup_count = 5, inserted_bases = GCGGG
int32_t pos_ref = 21;
int32_t pos_read = 19;
std::span<const seqan3::dna5> inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); // "GCGGGGCGGG"_dna5

auto duplicated_bases = detect_tandem_duplication(query_sequence,
length,
pos_ref,
pos_read,
inserted_bases,
4, // minimal length of duplication
pos_start_dup_seq,
pos_end_dup_seq,
tandem_dup_count);
EXPECT_EQ(tandem_dup_count, 5);
EXPECT_EQ(pos_start_dup_seq, 10);
EXPECT_EQ(pos_end_dup_seq, 20);

// Suffix Case: The duplication (insertion) comes before the matched sequence.
// (10, 20] -> pos_start_dup_seq, pos_end_dup_seq
// ref AAAACCGCGTA---------------GCGGGGCGGGTACGTAACGGTACG
// ||||||||| ||||||||||||||||||
// read AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC -> inserted sequence: GCGGGGCGGGGCGGG
//
// prefix_sequence GCGGGGCGGGTACGTAACGGTACG
// ||||||||||
// inserted sequence: GCGGGGCGGGGCGGG
// -> tandem_dup_count = 5, inserted_bases = GCGGG
pos_ref = 11; // position of first inserted base.
pos_read = 9;
inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length);
duplicated_bases = detect_tandem_duplication(query_sequence,
length,
pos_ref,
pos_read,
inserted_bases,
4, // minimal length of duplication
pos_start_dup_seq,
pos_end_dup_seq,
tandem_dup_count);
EXPECT_EQ(tandem_dup_count, 5);
EXPECT_EQ(pos_start_dup_seq, 10);
EXPECT_EQ(pos_end_dup_seq, 20);

// Special Case: The duplication (insertion) is in between of matching sequences.
// (10, 25] -> pos_start_dup_seq, pos_end_dup_seq
// ref AAAACCGCGTAGCGGG----------GCGGGGCGGGTACGTAACGGTACG
// |||||||||||||| |||||||||||||
// read AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC -> inserted sequence: GCGGGGCGGG
//
// suffix_sequence AAAACCGCGTAGCGGG prefix_sequence GCGGGGCGGGTACGTAACGGTACG
// ||||| ||||||||||
// inserted sequence: GCGGGGCGGG GCGGGGCGGG
// -> tandem_dup_count = 5, inserted_bases = GCGGG
length = 10;
pos_ref = 16;
pos_read = 14;
inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); // "GCGGG"_dna5

duplicated_bases = detect_tandem_duplication(query_sequence,
length,
pos_ref,
pos_read,
inserted_bases,
4, // minimal length of duplication
pos_start_dup_seq,
pos_end_dup_seq,
tandem_dup_count);
EXPECT_EQ(tandem_dup_count, 5);
EXPECT_EQ(pos_start_dup_seq, 10);
EXPECT_EQ(pos_end_dup_seq, 25);

// Case with errors at the end: The duplication (insertion) includes some errors at the end.
// (10, 20] -> pos_start_dup_seq, pos_end_dup_seq
// ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG
// ||||||||||||||||||| ||||||||
// read AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCAAATACGTAAC -> inserted sequence: GCGGGGCGGGGCAAA
//
// suffix_sequence AAAACCGCGTAGCGGGGCGGG
// ||||||||||
// inserted sequence: GCGGGGCGGGGCAAA
//
// suffix_sequence AAAACCGCGTAGCGGGGCGGG
// ||--- -> subsequence does not match without mismatches
// inserted sequence: GCAAA
//
seqan3::dna5_vector const & query_sequence_2 = {"AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCAAATACGTAAC"_dna5};
length = 15;
pos_ref = 21;
pos_read = 19;
inserted_bases = query_sequence_2 | seqan3::views::slice(pos_read, pos_read + length); // "GCGGGGCGGGGCAAA"_dna5

duplicated_bases = detect_tandem_duplication(query_sequence_2,
length,
pos_ref,
pos_read,
inserted_bases,
4, // minimal length of duplication
pos_start_dup_seq,
pos_end_dup_seq,
tandem_dup_count);
EXPECT_EQ(tandem_dup_count, 0);
EXPECT_EQ(pos_start_dup_seq, 20);
EXPECT_EQ(pos_end_dup_seq, 20);

// Case with a single error: The duplication (insertion) includes a single errors.
// (10, 20] -> pos_start_dup_seq, pos_end_dup_seq
// ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG
// ||||||||||||||||||| ||||||||
// read AACCGCGTAGCGGGGCGGGGCGAGGCGGGGCGGGTACGTAAC -> inserted sequence: GCGAGGCGGGGCGGG
//
// suffix_sequence AAAACCGCGTAGCGGGGCGGG
// |||-|||||| -> single mismatch in the alignment
// inserted sequence: GCGAGGCGGGGCGGG
//
seqan3::dna5_vector const & query_sequence_3 = {"AACCGCGTAGCGGGGCGGGGCGAGGCGGGGCGGGTACGTAAC"_dna5};
inserted_bases = query_sequence_3 | seqan3::views::slice(pos_read, pos_read + length); // "GCGAGGCGGGGCGGG"_dna5

duplicated_bases = detect_tandem_duplication(query_sequence_3,
length,
pos_ref,
pos_read,
inserted_bases,
4, // minimal length of duplication
pos_start_dup_seq,
pos_end_dup_seq,
tandem_dup_count);
EXPECT_EQ(tandem_dup_count, 0);
EXPECT_EQ(pos_start_dup_seq, 20);
EXPECT_EQ(pos_end_dup_seq, 20);
}

TEST(junction_detection, cigar_string_simple_ins)
{
std::string const read_name = "read021";
Expand Down
46 changes: 45 additions & 1 deletion test/api/structures_test.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
#include <gtest/gtest.h>

#include "structures/breakend.hpp"
#include "structures/cluster.hpp"
#include "structures/junction.hpp"
#include "variant_detection/method_enums.hpp"

#include <seqan3/alphabet/nucleotide/dna5.hpp>

using seqan3::operator""_dna5;

/* tests for method_enums */

TEST(structures, method_enums_detection_methods)
Expand Down Expand Up @@ -75,7 +81,7 @@ TEST(structures, method_enums_refinement_methods)

/* tests for junctions */

TEST(structures, breakend_flip_orientation)
TEST(structures, junctions_breakend_flip_orientation)
{
Breakend forward_breakend{"chr1", 42, strand::forward};
Breakend reverse_breakend{"chr1", 42, strand::reverse};
Expand All @@ -88,3 +94,41 @@ TEST(structures, breakend_flip_orientation)
forward_breakend.flip_orientation();
EXPECT_EQ(forward_breakend, reverse_breakend); // both are forward now
}

/* tests for clusters */

TEST(structures, clusters_get_common_tandem_dup_count)
{
Junction junction_1{Breakend{"chr1", 123456, strand::forward},
Breakend{"chr1", 234567, strand::forward},
""_dna5, 0, "read_1"};
Junction junction_2{Breakend{"chr1", 123456, strand::forward},
Breakend{"chr1", 234567, strand::forward},
""_dna5, 1, "read_1"};
Junction junction_3{Breakend{"chr1", 123456, strand::forward},
Breakend{"chr1", 234567, strand::forward},
""_dna5, 2, "read_1"};
Junction junction_4{Breakend{"chr1", 123456, strand::forward},
Breakend{"chr1", 234567, strand::forward},
""_dna5, 7, "read_1"};

// Tandem duplication with 2 copies:
Cluster cluster_1{{junction_3, junction_3, junction_3}};
EXPECT_EQ(cluster_1.get_common_tandem_dup_count(), 2);

// Tandem duplication with different amount of copies:
Cluster cluster_2{{junction_2, junction_3, junction_4}};
EXPECT_EQ(cluster_2.get_common_tandem_dup_count(), 3); // (1+2+7)/3=3,333

// More than two thirds of the junctions have a 0 tandem_dup_count, than its probably no tandem duplication.
Cluster cluster_3{{junction_1, junction_1, junction_1, junction_2}};
EXPECT_EQ(cluster_3.get_common_tandem_dup_count(), 0);

// Two thirds of the junctions have a 0 tandem_dup_count, than its probably a tandem duplication.
Cluster cluster_4{{junction_1, junction_1, junction_2}};
EXPECT_EQ(cluster_4.get_common_tandem_dup_count(), 0);

// Less than two thirds of the junctions have a 0 tandem_dup_count, than its probably a tandem duplication.
Cluster cluster_5{{junction_1, junction_2}};
EXPECT_EQ(cluster_5.get_common_tandem_dup_count(), 1);
}

0 comments on commit 5cd7ed5

Please sign in to comment.