From 7713d8be3aa11ec3be883273d212ef10387af5b6 Mon Sep 17 00:00:00 2001
From: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
Date: Mon, 25 Oct 2021 17:09:59 +0200
Subject: [PATCH 1/4] [FEATURE] detect tandem duplications via tandem_dup_count

---
 .../analyze_cigar_method.cpp                  | 46 ++++++++++++++-----
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/src/modules/sv_detection_methods/analyze_cigar_method.cpp b/src/modules/sv_detection_methods/analyze_cigar_method.cpp
index 368e2384..a85a9e37 100644
--- a/src/modules/sv_detection_methods/analyze_cigar_method.cpp
+++ b/src/modules/sv_detection_methods/analyze_cigar_method.cpp
@@ -27,30 +27,54 @@ void analyze_cigar(std::string const & read_name,
         using seqan3::get;
         int32_t length = get<0>(pair);
         seqan3::cigar::operation operation = get<1>(pair);
-        size_t tandem_dup_count = 0;
         if (operation == 'M'_cigar_operation || operation == '='_cigar_operation || operation == 'X'_cigar_operation)
         {
             pos_ref += length;
             pos_read += length;
         }
+        // -------------------------------------
         else if (operation == 'I'_cigar_operation) // I: Insertion (gap in the reference sequence)
         {
             if (length >= min_length)
             {
                 // Insertions cause one junction from the insertion location to the next base
                 auto inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length);
-                Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward},
-                                      Breakend{chromosome, pos_ref, strand::forward},
-                                      inserted_bases,
-                                      tandem_dup_count,
-                                      read_name};
-                if (gVerbose)
-                    seqan3::debug_stream << "INS: " << new_junction << "\n";
-                junctions.push_back(std::move(new_junction));
+        // ---------------- DUP ----------------
+                // Case - Duplication: The inserted bases include one or more copies of a duplicated sequence, with an
+                //                     origin somewhere else. -> global alignment
+                // ##ALT=<ID=DUP,Description="Duplication">
+                // TODO (23.7.21, irallia): This is Part of the Epic #144. Do we need a reference sequence for this?
+        // ---------------- DUP:TANDEM ----------------
+                size_t tandem_dup_count = 0;
+                int32_t pos_start_dup_seq{};
+                int32_t pos_end_dup_seq{};
+                // duplicated_bases = detect_tandem_duplication()
+                if (tandem_dup_count != 0)
+                {
+                    Junction new_junction{Breakend{chromosome, pos_start_dup_seq, strand::forward},
+                                          Breakend{chromosome, pos_end_dup_seq, strand::forward},
+                                          inserted_bases, // duplicated_bases,
+                                          tandem_dup_count,
+                                          read_name};
+                    if (gVerbose)
+                        seqan3::debug_stream << "DUP:TANDEM: " << new_junction << "\n";
+                    junctions.push_back(std::move(new_junction));
+                } else {
+        // ---------------- INS ----------------
+                    Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward},
+                                          Breakend{chromosome, pos_ref, strand::forward},
+                                          inserted_bases,
+                                          0,
+                                          read_name};
+                    if (gVerbose)
+                        seqan3::debug_stream << "INS: " << new_junction << "\n";
+                    junctions.push_back(std::move(new_junction));
+                }
             }
             pos_read += length;
         }
-        else if (operation == 'D'_cigar_operation)
+        // ---------------- DEL ----------------
+        else if (operation == 'D'_cigar_operation) // D: Deletion (gap in the read)
         {
             if (length >= min_length)
             {
@@ -58,7 +82,7 @@ void analyze_cigar(std::string const & read_name,
                 Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward},
                                       Breakend{chromosome, pos_ref + length, strand::forward},
                                       ""_dna5,
-                                      tandem_dup_count,
+                                      0,
                                       read_name};
                 if (gVerbose)
                     seqan3::debug_stream << "DEL: " << new_junction << "\n";

From 71d915e6951d4d79acb7bef9bab8c47fc756c9e3 Mon Sep 17 00:00:00 2001
From: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
Date: Tue, 2 Nov 2021 15:24:15 +0100
Subject: [PATCH 2/4] [FEATURE] detect_tandem_duplication in suffix and prefix
 case

---
 .../analyze_cigar_method.hpp                  | 54 +++++++++++++++++++
 .../analyze_cigar_method.cpp                  | 43 +++++++++++++--
 2 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/include/modules/sv_detection_methods/analyze_cigar_method.hpp b/include/modules/sv_detection_methods/analyze_cigar_method.hpp
index 7ab08958..c8b9cfec 100644
--- a/include/modules/sv_detection_methods/analyze_cigar_method.hpp
+++ b/include/modules/sv_detection_methods/analyze_cigar_method.hpp
@@ -2,6 +2,60 @@
 
 #include "structures/junction.hpp"  // for class Junction
 
+
+/*! \brief This function checks if the inserted bases are tandem duplicated.
+ *
+ * \param[in]       query_sequence      - SEQ field of the SAM/BAM file
+ * \param[in]       length              - length of inserted part, given by the CIGAR
+ * \param[in]       pos_ref             - position of the inserted part in the ref (current position)
+ * \param[in]       pos_read            - position of the inserted part in the read (current position)
+ * \param[in]       inserted_bases      - the inserted bases of the possible duplication
+ * \param[in]       min_length          - minimum length of variants to detect (default 30 bp,
+ *                                                                              expected to be non-negative)
+ * \param[in, out]  pos_start_dup_seq   - start position of the duplicated seq (excluding itself)
+ * \param[in, out]  pos_end_dup_seq     - end position of the duplicated seq (including itself)
+ * \param[in, out]  tandem_dup_count    - the number of tandem copies of the inserted sequence
+ * \returns         duplicated_bases    - the duplicated bases of the duplication
+ *
+ * \details If the inserted bases include one or more copies of a duplicated sequence, which is suffix or prefix of
+ *          another copy, we have a Tandem Duplication. To check this, we have to do a semi-global alignment.
+ *
+ *          Case 1: The duplication (insertion) comes after the matched sequence. Thus we need to check if the inserted
+ *                  sequence matches (partly) the suffix sequence. The length of the matching part yields to the amount
+ *                  of copies, thus we can calculate the tandem_dup_count and the inserted_bases.
+ *
+ *                  ref AAAACCGCGTAGCGGG----------TACGTAACGGTACG
+ *                        ||||||||||||||          |||||||| -> inserted sequence: GCGGGGCGGG
+ *                  read  AACCGCGTAGCGGGGCGGGGCGGGTACGTAAC
+ *
+ *                  suffix_sequence AAAACCGCGTAGCGGG       -> free_end_gaps_sequence1_leading{true},
+ *                                             |||||          free_end_gaps_sequence1_trailing{false}
+ *                  inserted_bases             GCGGGGCGGG  -> free_end_gaps_sequence2_leading{false},
+ *                                                            free_end_gaps_sequence2_trailing{true}
+ *                  -> tandem_dup_count = 3, duplicated_bases = GCGGG
+ *
+ *          Case 2: The duplication (insertion) comes before the matched sequence.
+ *                  ref AAAACCGCGTA----------GCGGGTACGTAACGGTACG
+ *                        |||||||||          |||||||||||||  -> inserted sequence: GCGGGGCGGG
+ *                  read  AACCGCGTAGCGGGGCGGGGCGGGTACGTAAC
+ *
+ *                  prefix_sequence     GCGGGTACGTAACGGTACG -> free_end_gaps_sequence1_leading{false},
+ *                                      |||||                  free_end_gaps_sequence1_trailing{true}
+ *                  inserted_bases GCGGGGCGGG               -> free_end_gaps_sequence2_leading{true},
+ *                                                             free_end_gaps_sequence2_trailing{false}
+ *                  -> tandem_dup_count = 3, duplicated_bases = GCGGG
+ * \see For some complex examples see detection_test.cpp.
+ */
+std::span<seqan3::dna5 const> detect_tandem_duplication(seqan3::dna5_vector const & query_sequence,
+                                                        int32_t length,
+                                                        int32_t pos_ref,
+                                                        int32_t pos_read,
+                                                        std::span<seqan3::dna5 const> & inserted_bases,
+                                                        int32_t const min_length,
+                                                        int32_t & pos_start_dup_seq,
+                                                        int32_t & pos_end_dup_seq,
+                                                        size_t & tandem_dup_count);
+
 /*! \brief This function steps through the CIGAR string and stores junctions with their position in reference and read.
  *
  * \param[in]       read_name       - QNAME field of the SAM/BAM file
diff --git a/src/modules/sv_detection_methods/analyze_cigar_method.cpp b/src/modules/sv_detection_methods/analyze_cigar_method.cpp
index a85a9e37..0335e7a5 100644
--- a/src/modules/sv_detection_methods/analyze_cigar_method.cpp
+++ b/src/modules/sv_detection_methods/analyze_cigar_method.cpp
@@ -10,6 +10,34 @@
 using seqan3::operator""_cigar_operation;
 using seqan3::operator""_dna5;
 
+std::span<seqan3::dna5 const> detect_tandem_duplication([[maybe_unused]] seqan3::dna5_vector const & query_sequence,
+                                                        [[maybe_unused]] int32_t length,
+                                                        [[maybe_unused]] int32_t pos_ref,
+                                                        [[maybe_unused]] int32_t pos_read,
+                                                        std::span<seqan3::dna5 const> & inserted_bases,
+                                                        [[maybe_unused]] int32_t const min_length,
+                                                        [[maybe_unused]] int32_t & pos_start_dup_seq,
+                                                        [[maybe_unused]] int32_t & pos_end_dup_seq,
+                                                        [[maybe_unused]] size_t & tandem_dup_count)
+{
+    // Suffix Case:
+    // auto [ suffix_sequence_match_score, length_of_single_dupl_sequence_1 ] = align_suffix_or_prefix(...);
+
+    // Prefix Case:
+    // auto [ prefix_sequence_match_score, length_of_single_dupl_sequence_2 ] = align_suffix_or_prefix(...);
+
+    // int16_t length_of_single_dupl_sequence = std::min(length_of_single_dupl_sequence_1,
+    //                                                   length_of_single_dupl_sequence_2);
+    // tandem_dup_count = suffix_sequence_match_score / length_of_single_dupl_sequence     // duplicated part on ref
+    //                  + inserted_bases.size() / length_of_single_dupl_sequence           // inserted duplications
+    //                  + prefix_sequence_match_score / length_of_single_dupl_sequence;    // duplicated part on ref
+    // If its a duplication instead of an insertion, we save the (possible multiple times) duplicated part as duplicated_bases
+    std::span<seqan3::dna5 const> duplicated_bases{};
+    if (tandem_dup_count > 0)
+        duplicated_bases = (inserted_bases /*| seqan3::views::slice(0, length_of_single_dupl_sequence)*/);
+    return duplicated_bases;
+}
+
 void analyze_cigar(std::string const & read_name,
                    std::string const & chromosome,
                    int32_t const query_start_pos,
@@ -38,7 +66,8 @@ void analyze_cigar(std::string const & read_name,
             if (length >= min_length)
             {
                 // Insertions cause one junction from the insertion location to the next base
-                auto inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length);
+                std::span<seqan3::dna5 const> inserted_bases = query_sequence | seqan3::views::slice(pos_read,
+                                                                                                     pos_read + length);
         // ---------------- DUP ----------------
                 // Case - Duplication: The inserted bases include one or more copies of a duplicated sequence, with an
                 //                     origin somewhere else. -> global alignment
@@ -48,12 +77,20 @@ void analyze_cigar(std::string const & read_name,
                 size_t tandem_dup_count = 0;
                 int32_t pos_start_dup_seq{};
                 int32_t pos_end_dup_seq{};
-                // duplicated_bases = detect_tandem_duplication()
+                std::span<seqan3::dna5 const> duplicated_bases = detect_tandem_duplication(query_sequence,
+                                                                                           length,
+                                                                                           pos_ref,
+                                                                                           pos_read,
+                                                                                           inserted_bases,
+                                                                                           min_length,
+                                                                                           pos_start_dup_seq,
+                                                                                           pos_end_dup_seq,
+                                                                                           tandem_dup_count);
                 if (tandem_dup_count != 0)
                 {
                     Junction new_junction{Breakend{chromosome, pos_start_dup_seq, strand::forward},
                                           Breakend{chromosome, pos_end_dup_seq, strand::forward},
-                                          inserted_bases, // duplicated_bases,
+                                          duplicated_bases,
                                           tandem_dup_count,
                                           read_name};
                     if (gVerbose)

From 18f9a90b0825ae1e291ef5cd72af024ba011ba23 Mon Sep 17 00:00:00 2001
From: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
Date: Wed, 28 Jul 2021 13:17:59 +0200
Subject: [PATCH 3/4] [FEATURE] Get Tandem Duplications from CIGAR

Signed-off-by: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
---
 include/iGenVar.hpp                           |   1 +
 .../analyze_cigar_method.hpp                  |  36 +++++
 .../analyze_cigar_method.cpp                  | 133 +++++++++++++++---
 src/variant_detection/variant_output.cpp      |   6 +-
 test/cli/iGenVar_cli_test.cpp                 |   1 +
 test/data/datasources.cmake                   |   8 +-
 test/data/mini_example/output_err.txt         |  11 +-
 test/data/mini_example/output_res.txt         |   6 +-
 .../output_short_and_long_err.txt             |  42 ++++--
 .../output_short_and_long_res.txt             |  12 +-
 10 files changed, 220 insertions(+), 36 deletions(-)

diff --git a/include/iGenVar.hpp b/include/iGenVar.hpp
index a3474de7..39000f7c 100644
--- a/include/iGenVar.hpp
+++ b/include/iGenVar.hpp
@@ -26,6 +26,7 @@ struct cmd_arguments
     /* -c */ clustering_methods clustering_method{hierarchical_clustering};          // default: hierarchical clustering
     /* -r */ refinement_methods refinement_method{no_refinement};                    // default: no refinement
 // SV specifications:
+    /* -e TODO (irallia 19.08.21): duplication_errors */
     /* -k */ uint64_t min_var_length = 30;
     /* -l */ uint64_t max_var_length = 100000;
     /* -m */ uint64_t max_tol_inserted_length = 50;
diff --git a/include/modules/sv_detection_methods/analyze_cigar_method.hpp b/include/modules/sv_detection_methods/analyze_cigar_method.hpp
index c8b9cfec..79ffb00c 100644
--- a/include/modules/sv_detection_methods/analyze_cigar_method.hpp
+++ b/include/modules/sv_detection_methods/analyze_cigar_method.hpp
@@ -2,6 +2,42 @@
 
 #include "structures/junction.hpp"  // for class Junction
 
+/*! \brief This function checks if the inserted bases are tandem duplicated.
+ *
+ * \param[in] config                         - configuration for a semi-gobal alignment
+ * \param[in] min_length                     - minimum length of variants to detect (default 30 bp,
+ * \param[in] sequence                       - suffix or prefix sequence
+ * \param[in] inserted_bases                 - the inserted bases of the possible duplication
+ * \param[in] is_suffix                      - true: suffix case, false: prefix case
+ * param[out] match_score                    - if we found a matching duplication, this value represents the maximal
+ *                                             amount of matches with the reference and thus the length of the existing
+ *                                             duplicated part on the reference. If there is no duplication, this value
+ *                                             is 0.
+ * param[out] length_of_single_dupl_sequence - greatest common divisor of length of inserted sequence and length of
+ *                                             maching part -> length of a single duplicated sequence
+ * \returns std::tie(match_score, length_of_single_dupl_sequence) - a tuple of the resulting values
+ *
+ * \details In this function the inserted bases are recursively aligned segment by segment until it has been proven that
+ *          it is a real duplication.
+ *          For simplicity, we only consider the suffix case here, since the prefix case works the same way:
+ *
+ *          suffix_sequence AAAACCGCGTAGCGGGGCGGG
+ *                                     ||||||||||
+ *          inserted_bases             GCGGGGCGGGGCGGG  -> unmatched_inserted_bases: GCGGG
+ *                                                      -> match_score = 10
+ *                                                      -> length_of_single_dupl_sequence = gcd(15, 10) = 5
+ *
+ *          suffix_sequence AAAACCGCGTAGCGGGGCGGG
+ *                                          |||||
+ *          unmatched_inserted_bases        GCGGG
+ *
+ *          -> match_score = 5, length_of_single_dupl_sequence = gcd(15, 5) = 5
+ */
+std::tuple<size_t, size_t> align_suffix_or_prefix(auto const & config,
+                                                  int32_t const min_length,
+                                                  std::span<const seqan3::dna5> & sequence,
+                                                  std::span<const seqan3::dna5> & inserted_bases,
+                                                  bool is_suffix);
 
 /*! \brief This function checks if the inserted bases are tandem duplicated.
  *
diff --git a/src/modules/sv_detection_methods/analyze_cigar_method.cpp b/src/modules/sv_detection_methods/analyze_cigar_method.cpp
index 0335e7a5..7d78e99d 100644
--- a/src/modules/sv_detection_methods/analyze_cigar_method.cpp
+++ b/src/modules/sv_detection_methods/analyze_cigar_method.cpp
@@ -1,3 +1,11 @@
+#include <limits>   // for std::numeric_limits
+#include <numeric>  // for std::gcd (greatest common divisor)
+
+// #include <seqan3/alignment/aligned_sequence/debug_stream_alignment.hpp> // for a nicer alignment view
+#include <seqan3/alignment/configuration/align_config_method.hpp>
+#include <seqan3/alignment/configuration/align_config_scoring_scheme.hpp>
+#include <seqan3/alignment/pairwise/align_pairwise.hpp>
+#include <seqan3/alignment/scoring/nucleotide_scoring_scheme.hpp>
 #include <seqan3/alphabet/cigar/cigar.hpp>
 #include <seqan3/alphabet/nucleotide/dna5.hpp>
 #include <seqan3/core/debug_stream.hpp>
@@ -10,31 +18,116 @@
 using seqan3::operator""_cigar_operation;
 using seqan3::operator""_dna5;
 
-std::span<seqan3::dna5 const> detect_tandem_duplication([[maybe_unused]] seqan3::dna5_vector const & query_sequence,
-                                                        [[maybe_unused]] int32_t length,
-                                                        [[maybe_unused]] int32_t pos_ref,
-                                                        [[maybe_unused]] int32_t pos_read,
+std::tuple<size_t, size_t> align_suffix_or_prefix(auto const & config,
+                                                  int32_t const min_length,
+                                                  std::span<seqan3::dna5 const> & sequence,
+                                                  std::span<seqan3::dna5 const> & inserted_bases,
+                                                  bool is_suffix)
+{
+    // length of matching part of suffix or prefix sequence
+    size_t match_score{};
+    // The number of tandem copies of this junction.
+    size_t length_of_single_dupl_sequence = std::numeric_limits<size_t>::max();
+
+    auto results = seqan3::align_pairwise(std::tie(sequence, inserted_bases), config);
+    auto & res = *results.begin();
+    // TODO (irallia 17.8.21): The mismatches should give us the opportunity to allow a given amount of errors in the
+    // duplication.
+    size_t matches = res.score() % 100;
+    size_t mismatches = (res.score() - matches) * (-1);
+    // For the beginning we do not allow mistakes, we should change this later, see todo above.
+    if (mismatches > 0)
+        return std::tie(match_score, length_of_single_dupl_sequence);
+    // found duplicated sequence in front of inserted sequence
+    if (matches >= min_length)
+    {
+        // seqan3::debug_stream << "Resulting alignment:\n" << res.alignment() << '\n';
+        match_score = matches;
+        // The possible length of the single duplicated: greatest common divisor of length of inserted part and length
+        // of maching part
+        length_of_single_dupl_sequence = std::gcd(inserted_bases.size(), matches);
+        if(matches != inserted_bases.size())
+        {
+            std::span<seqan3::dna5 const> unmatched_inserted_bases{};
+            if (is_suffix)
+                unmatched_inserted_bases = inserted_bases | seqan3::views::slice(match_score, inserted_bases.size());
+            else
+                unmatched_inserted_bases = inserted_bases | seqan3::views::slice(0, inserted_bases.size() - match_score);
+            // The first length_of_single_dupl_sequence is already the greatest common divisor and therefore the next
+            // recursively calculated one can be ignored.
+            auto [ next_match_score,
+                   next_length_of_single_dupl_sequence ] = align_suffix_or_prefix(config,
+                                                                                  min_length,
+                                                                                  sequence,
+                                                                                  unmatched_inserted_bases,
+                                                                                  is_suffix);
+            // If the substring does not match, there is no real duplication.
+            if (next_match_score == 0) {
+                length_of_single_dupl_sequence = std::numeric_limits<size_t>::max();
+                return std::tie(next_match_score, length_of_single_dupl_sequence);
+            }
+        }
+    }
+    // The first match_score calculated is automatically the maximum, so the recursively next one can be ignored.
+    return std::tie(match_score, length_of_single_dupl_sequence);
+}
+
+std::span<seqan3::dna5 const> detect_tandem_duplication(seqan3::dna5_vector const & query_sequence,
+                                                        int32_t length,
+                                                        int32_t pos_ref,
+                                                        int32_t pos_read,
                                                         std::span<seqan3::dna5 const> & inserted_bases,
-                                                        [[maybe_unused]] int32_t const min_length,
-                                                        [[maybe_unused]] int32_t & pos_start_dup_seq,
-                                                        [[maybe_unused]] int32_t & pos_end_dup_seq,
-                                                        [[maybe_unused]] size_t & tandem_dup_count)
+                                                        int32_t const min_length,
+                                                        int32_t & pos_start_dup_seq,
+                                                        int32_t & pos_end_dup_seq,
+                                                        size_t & tandem_dup_count)
 {
+    auto scoring_scheme = seqan3::align_cfg::scoring_scheme{
+                              seqan3::nucleotide_scoring_scheme{seqan3::match_score{1},
+                                                                seqan3::mismatch_score{-100}}};
+
+    auto gap_scheme = seqan3::align_cfg::gap_cost_affine{seqan3::align_cfg::open_score{0},
+                                                         seqan3::align_cfg::extension_score{-100}};
     // Suffix Case:
-    // auto [ suffix_sequence_match_score, length_of_single_dupl_sequence_1 ] = align_suffix_or_prefix(...);
+    auto suffix_config = seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{true},
+                                                          seqan3::align_cfg::free_end_gaps_sequence2_leading{false},
+                                                          seqan3::align_cfg::free_end_gaps_sequence1_trailing{false},
+                                                          seqan3::align_cfg::free_end_gaps_sequence2_trailing{true}} |
+                         scoring_scheme | gap_scheme;
 
+    std::span<seqan3::dna5 const> suffix_sequence = query_sequence | seqan3::views::slice(0, pos_read);
+    auto [ suffix_sequence_match_score, length_of_single_dupl_sequence_1 ] = align_suffix_or_prefix(suffix_config,
+                                                                                                    min_length,
+                                                                                                    suffix_sequence,
+                                                                                                    inserted_bases,
+                                                                                                    true);
     // Prefix Case:
-    // auto [ prefix_sequence_match_score, length_of_single_dupl_sequence_2 ] = align_suffix_or_prefix(...);
+    auto prefix_config = seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{false},
+                                                          seqan3::align_cfg::free_end_gaps_sequence2_leading{true},
+                                                          seqan3::align_cfg::free_end_gaps_sequence1_trailing{true},
+                                                          seqan3::align_cfg::free_end_gaps_sequence2_trailing{false}} |
+                         scoring_scheme | gap_scheme;
 
-    // int16_t length_of_single_dupl_sequence = std::min(length_of_single_dupl_sequence_1,
-    //                                                   length_of_single_dupl_sequence_2);
-    // tandem_dup_count = suffix_sequence_match_score / length_of_single_dupl_sequence     // duplicated part on ref
-    //                  + inserted_bases.size() / length_of_single_dupl_sequence           // inserted duplications
-    //                  + prefix_sequence_match_score / length_of_single_dupl_sequence;    // duplicated part on ref
+    std::span<seqan3::dna5 const> prefix_sequence = query_sequence | seqan3::views::slice(pos_read + length,
+                                                                                          query_sequence.size());
+    auto [ prefix_sequence_match_score, length_of_single_dupl_sequence_2 ] = align_suffix_or_prefix(prefix_config,
+                                                                                                    min_length,
+                                                                                                    prefix_sequence,
+                                                                                                    inserted_bases,
+                                                                                                    false);
+
+    pos_start_dup_seq = pos_ref - 1 - suffix_sequence_match_score;
+    pos_end_dup_seq = pos_ref - 1 + prefix_sequence_match_score;
+
+    int16_t length_of_single_dupl_sequence = std::min(length_of_single_dupl_sequence_1,
+                                                      length_of_single_dupl_sequence_2);
+    tandem_dup_count = suffix_sequence_match_score / length_of_single_dupl_sequence     // duplicated part on ref
+                     + inserted_bases.size() / length_of_single_dupl_sequence           // inserted duplications
+                     + prefix_sequence_match_score / length_of_single_dupl_sequence;    // duplicated part on ref
     // If its a duplication instead of an insertion, we save the (possible multiple times) duplicated part as duplicated_bases
     std::span<seqan3::dna5 const> duplicated_bases{};
     if (tandem_dup_count > 0)
-        duplicated_bases = (inserted_bases /*| seqan3::views::slice(0, length_of_single_dupl_sequence)*/);
+        duplicated_bases = (inserted_bases | seqan3::views::slice(0, length_of_single_dupl_sequence));
     return duplicated_bases;
 }
 
@@ -69,6 +162,7 @@ void analyze_cigar(std::string const & read_name,
                 std::span<seqan3::dna5 const> inserted_bases = query_sequence | seqan3::views::slice(pos_read,
                                                                                                      pos_read + length);
         // ---------------- DUP ----------------
+
                 // Case - Duplication: The inserted bases include one or more copies of a duplicated sequence, with an
                 //                     origin somewhere else. -> global alignment
                 // ##ALT=<ID=DUP,Description="Duplication">
@@ -94,7 +188,11 @@ void analyze_cigar(std::string const & read_name,
                                           tandem_dup_count,
                                           read_name};
                     if (gVerbose)
+                    {
                         seqan3::debug_stream << "DUP:TANDEM: " << new_junction << "\n";
+                        seqan3::debug_stream << "\t\t\tduplicated sequence: " << duplicated_bases
+                                             << " with " << tandem_dup_count << " duplications\n";
+                    }
                     junctions.push_back(std::move(new_junction));
                 } else {
         // ---------------- INS ----------------
@@ -104,7 +202,10 @@ void analyze_cigar(std::string const & read_name,
                                           0,
                                           read_name};
                     if (gVerbose)
+                    {
                         seqan3::debug_stream << "INS: " << new_junction << "\n";
+                        seqan3::debug_stream << "\t\t\tinserted sequence: " << inserted_bases << "\n";
+                    }
                     junctions.push_back(std::move(new_junction));
                 }
             }
diff --git a/src/variant_detection/variant_output.cpp b/src/variant_detection/variant_output.cpp
index 4d9ae2ae..0dbf5d31 100644
--- a/src/variant_detection/variant_output.cpp
+++ b/src/variant_detection/variant_output.cpp
@@ -32,6 +32,10 @@ void find_and_output_variants(std::map<std::string, int32_t> & references_length
                     {
                         int distance = mate2.position - mate1.position - 1;
                         int sv_length = insert_size - distance;
+                        // Distinguish SVLEN calculation for TANDEM:DUP
+                        if (clusters[i].get_common_tandem_dup_count() > 0)
+                            sv_length = distance + 1;
+
                         // The SVLEN is neither too short nor too long than specified by the user.
                         if (std::abs(sv_length) >= args.min_var_length &&
                             std::abs(sv_length) <= args.max_var_length)
@@ -46,7 +50,7 @@ void find_and_output_variants(std::map<std::string, int32_t> & references_length
                                 tmp.add_info("SVTYPE", "DUP");
                                 // Increment position by 1 because VCF is 1-based
                                 tmp.set_pos(mate1.position + 1);
-                                tmp.add_info("SVLEN", std::to_string(distance));
+                                tmp.add_info("SVLEN", std::to_string(sv_length));
                                 // Increment end by 1 because VCF is 1-based
                                 tmp.add_info("END", std::to_string(mate2.position + 1));
                                 tmp.print(out_stream);
diff --git a/test/cli/iGenVar_cli_test.cpp b/test/cli/iGenVar_cli_test.cpp
index 6f29309a..371f88ca 100644
--- a/test/cli/iGenVar_cli_test.cpp
+++ b/test/cli/iGenVar_cli_test.cpp
@@ -197,6 +197,7 @@ TEST_F(iGenVar_cli_test, test_verbose_option)
     {
         "Detect junctions in long reads...\n"
         "INS: chr21\t41972615\tForward\tchr21\t41972616\tForward\t1681\t0\tm2257/8161/CCS\n"
+        "\t\t\tinserted sequence: GAGTGGACCTCAGCAAACTCCCAGTAGAGCTGCAGCAGAGGGGGTCTGACTGTTAGGATGGAAAACTAACAAACAGAAGGCAATAGCATCAACAACAACAAAAAAAAACTGCCACACAAAAACCGCTATCCGAAGATCACCAACATCAAAGATCGAACAGGTAGACAAATGACGAAGAGAGGAAAAAACAGTGCAAAAAAGGCTGAAAAGCCCAGAACCACCTCTTCCCTCCAGAGGATCACAACTCCTCACCAGGAAAGGGAACAAAACTGCACAGAGAAGAGTTTGACCAATGACAGAAGTAGGCTTCAGCAGAATGGGTAATAACTCCTCTGAGCTAAAGGAGCATGTTCCTACCCTAATGCAAGGAAGCTAAAGATACTTGATAAAAGGTTACAGGAACTGCTAACTAAATAAACCAGTTCAGAGAAGAACATAAATGACCTAAATGGAGCTGAAAAACACAGCATGAGAACTTCATGAAGGAATACACAAGGTATCAACAGACAAGTCCATCAGGCAGAAGAAAGGGATATCAGAGATTGAAGATCAACTTAATGAAATAAAGCATGCAAGACGAGATTAGTGAGAAAAAAGAATTAAAAGAAATGAGCAAAGGCCTCAAGGAAATATGGGACTATGTGTAAAAGACCAAGCATACGGTTTGATTGGTGTATGTGAAAATGACAGGGAAAATGGAACCAAGTTGGAAAACACTCTTCAGGATATCATGCAGGAGAACCTCCCAACCTAGCAAGAGAAGCCAACATTCACATTCAGGAAATACAAGAGAACACCACCAAGATACTCCTTGAGAAGTAGCAAACCCCCAAGACACATAATTGTTCAGATTCAGGCAAGGGTGAAAATGAAGGAAAAAATGCTAAGAGAGCCAGAGAGAAAGGTATGGGTTATCCACAAAAGGGCCAGCCATCAGACTAAGAGCAATTCTCTGCAGAAACCCTACAACCAGAAGAGAGAAGGGGCCAATATTCAACATTCTTAAAGAAAAGAATTTTCAACCCAGAATTTCATATCCAGCCAAAACAAAGCTTCGTAAGTGAAGGAGAAATAAATTTCTTTACAGACAAGCAAATGACTGAAGAAGATTTTTGTCACCACCATGCCTGCCTTACAAGATCTCCTGAAGGAAGCACTAAGACATGGGAAGGAAAAAATCCAGTACCAAGCCACTGCTAAACCATACCAAAATGTAGAGACTCAATGCTTAGGATAGGAAACTGCATCAACTAGCAGTCAAAATAACCAGCTAGCATTCATAATGACAGGATCAAATTCAGACCACATACAATTATTAACCTTAAATGTAAATGGGCTAAATGCCGCAATTAAAAGACACATCACTGGCAAATTGGATAAAGAGTCAAGCCCAATCGGTGTGCTGTTATTCAAGGAGACACCACTCTCACGTGCAAGAGACACAGATAGGCTCGAAAATGAATAGGGATGAAGGAAGATTACCAAGCAAATGGAAAGCAAAAAAAAAAAAAGCAGGGGTTGCAAATCCTAGTCTCTGTAAAACACTACTTTAAACCAAGAAAGATCAAAAGAGACAAAGAGGGCTTTAATAATGGTAATAGGGGGATTAATTCAACAAGAAGAGTTAACTATCCTAAATATATATGCTGCCTAATACAGGCACACCCAGATTCATAAAGCA\n"
         "The read depth method for long reads is not yet implemented.\n"
         "BND: chr21\t41972615\tReverse\tchr22\t17458415\tReverse\t0\t0\tm41327/11677/CCS\n"
         "The read depth method for long reads is not yet implemented.\n"
diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake
index 340e0718..a26ccb6c 100644
--- a/test/data/datasources.cmake
+++ b/test/data/datasources.cmake
@@ -29,19 +29,19 @@ declare_datasource (FILE single_end_mini_example.sam
 # copies file to <build>/data/output_err.txt
 declare_datasource (FILE output_err.txt
                     URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_err.txt
-                    URL_HASH SHA256=606826366c63cf8ed09c0efddbc6a010dd3f9e670946f690cb1bc54d246e7fcd)
+                    URL_HASH SHA256=b4c832bbf50cf3b9893191caa4dcc811299ec9de0270f2470bbb22a509826d6a)
 
 # copies file to <build>/data/output_res.txt
 declare_datasource (FILE output_res.txt
                     URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_res.txt
-                    URL_HASH SHA256=71238d79330c9efcf61467c20cd998517bf755d64742f60cbbfe0a5399a6c063)
+                    URL_HASH SHA256=6ce03518859776b951b9246c0bdf62537484cdb120de752468597f21524523de)
 
 # copies file to <build>/data/output_short_and_long_err.txt
 declare_datasource (FILE output_short_and_long_err.txt
                     URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_short_and_long_err.txt
-                    URL_HASH SHA256=4edfef05a26c876d51fa9ecabab0ef03ce52c6e71206eb96dd5e1751a852439c)
+                    URL_HASH SHA256=8dcede8d8b733e1ac9eb2b327c5d14d890d0c6f4e77c95ec8bf9e0d10a593a2e)
 
 # copies file to <build>/data/output_short_and_long_res.txt
 declare_datasource (FILE output_short_and_long_res.txt
                     URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_short_and_long_res.txt
-                    URL_HASH SHA256=55262c77c479cd4d767a43a03d4afd1528d722097de8a81d9e52805b37671e01)
+                    URL_HASH SHA256=d17b7b163bdbbb2c36fab6ca8a6282645e1ceaf240f66c183c56ccb307d59b90)
diff --git a/test/data/mini_example/output_err.txt b/test/data/mini_example/output_err.txt
index 197e86d8..df940637 100644
--- a/test/data/mini_example/output_err.txt
+++ b/test/data/mini_example/output_err.txt
@@ -11,14 +11,18 @@ DEL: chr1	56	Forward	chr1	70	Forward	0	0	read018
 DUP:TANDEM: chr1	109	Forward	chr1	124	Forward	0	2	read021
 2 segments describe this tandem duplication. Its length on the read is 34 and a single duplicated part has a length of 16 => tandem_dup_count = 2
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read023
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read024
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read025
+			inserted sequence: CCCCGGGGCCAATTT
 BND: chr1	96	Forward	chr1	125	Forward	0	0	read027
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read029
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read030
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read031
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read031
+			duplicated sequence: ATATATTT with 2 duplications
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read033
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
 BND: chr1	180	Reverse	chr1	187	Reverse	0	0	read034
@@ -38,10 +42,15 @@ DEL: chr1	335	Forward	chr1	350	Forward	0	0	read043
 DEL: chr1	335	Forward	chr1	350	Forward	0	0	read044
 DEL: chr1	335	Forward	chr1	350	Forward	0	0	read045
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read046
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read047
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read048
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read049
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read050
+			inserted sequence: GGTAACGTGTA
 DEL: chr1	383	Forward	chr1	395	Forward	0	0	read050
 BND: chr1	10	Reverse	chr1	470	Reverse	0	0	read051
 BND: chr1	10	Reverse	chr1	470	Reverse	0	0	read052
diff --git a/test/data/mini_example/output_res.txt b/test/data/mini_example/output_res.txt
index 86570a17..f75bff9e 100644
--- a/test/data/mini_example/output_res.txt
+++ b/test/data/mini_example/output_res.txt
@@ -12,12 +12,12 @@
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	MYSAMPLE
 chr1	57	.	N	<DEL>	9	PASS	END=70;SVLEN=-13;SVTYPE=DEL	GT	./.
 chr1	97	.	N	<DEL>	1	PASS	END=125;SVLEN=-28;SVTYPE=DEL	GT	./.
-chr1	110	.	N	<DUP:TANDEM>	1	PASS	END=125;SVLEN=14;SVTYPE=DUP	GT	./.
+chr1	110	.	N	<DUP:TANDEM>	1	PASS	END=125;SVLEN=15;SVTYPE=DUP	GT	./.
 chr1	125	.	N	<INS>	3	PASS	END=125;SVLEN=15;SVTYPE=INS	GT	./.
-chr1	180	.	N	<INS>	1	PASS	END=180;SVLEN=8;SVTYPE=INS	GT	./.
+chr1	180	.	N	<DUP:TANDEM>	1	PASS	END=188;SVLEN=8;SVTYPE=DUP	GT	./.
 chr1	266	.	N	<DEL>	4	PASS	END=286;SVLEN=-20;SVTYPE=DEL	GT	./.
 chr1	282	.	N	<DEL>	1	PASS	END=299;SVLEN=-17;SVTYPE=DEL	GT	./.
 chr1	336	.	N	<DEL>	4	PASS	END=350;SVLEN=-14;SVTYPE=DEL	GT	./.
 chr1	368	.	N	<INS>	5	PASS	END=368;SVLEN=11;SVTYPE=INS	GT	./.
 chr1	384	.	N	<DEL>	1	PASS	END=395;SVLEN=-11;SVTYPE=DEL	GT	./.
-chr1	512	.	N	<DUP:TANDEM>	1	PASS	END=529;SVLEN=16;SVTYPE=DUP	GT	./.
+chr1	512	.	N	<DUP:TANDEM>	1	PASS	END=529;SVLEN=17;SVTYPE=DUP	GT	./.
diff --git a/test/data/mini_example/output_short_and_long_err.txt b/test/data/mini_example/output_short_and_long_err.txt
index 36900cef..9d6a5ef0 100644
--- a/test/data/mini_example/output_short_and_long_err.txt
+++ b/test/data/mini_example/output_short_and_long_err.txt
@@ -9,22 +9,39 @@ DEL: chr1	56	Forward	chr1	70	Forward	0	0	read016
 DEL: chr1	56	Forward	chr1	70	Forward	0	0	read017
 DEL: chr1	56	Forward	chr1	70	Forward	0	0	read018
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read001
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read002
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read003
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read004
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read005
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read006
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read007
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read008
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read009
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read010
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read011
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read013
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read014
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read015
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read016
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read017
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read018
+			inserted sequence: CCCCGGGGCCAATTT
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read013
+			duplicated sequence: ATATATTT with 2 duplications
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read014
+			duplicated sequence: ATATATTT with 2 duplications
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read015
+			duplicated sequence: ATATATTT with 2 duplications
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read016
+			duplicated sequence: ATATATTT with 2 duplications
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read017
+			duplicated sequence: ATATATTT with 2 duplications
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read018
+			duplicated sequence: ATATATTT with 2 duplications
 Detect junctions in long reads...
 DEL: chr1	56	Forward	chr1	70	Forward	0	0	read010
 DEL: chr1	56	Forward	chr1	70	Forward	0	0	read011
@@ -38,14 +55,18 @@ DEL: chr1	56	Forward	chr1	70	Forward	0	0	read018
 DUP:TANDEM: chr1	109	Forward	chr1	124	Forward	0	2	read021
 2 segments describe this tandem duplication. Its length on the read is 34 and a single duplicated part has a length of 16 => tandem_dup_count = 2
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read023
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read024
+			inserted sequence: CCCCGGGGCCAATTT
 INS: chr1	124	Forward	chr1	125	Forward	15	0	read025
+			inserted sequence: CCCCGGGGCCAATTT
 BND: chr1	96	Forward	chr1	125	Forward	0	0	read027
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read029
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read030
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
-INS: chr1	179	Forward	chr1	180	Forward	8	0	read031
+DUP:TANDEM: chr1	179	Forward	chr1	187	Forward	8	2	read031
+			duplicated sequence: ATATATTT with 2 duplications
 DUP:TANDEM: chr1	180	Forward	chr1	187	Forward	0	2	read033
 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2
 BND: chr1	180	Reverse	chr1	187	Reverse	0	0	read034
@@ -65,10 +86,15 @@ DEL: chr1	335	Forward	chr1	350	Forward	0	0	read043
 DEL: chr1	335	Forward	chr1	350	Forward	0	0	read044
 DEL: chr1	335	Forward	chr1	350	Forward	0	0	read045
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read046
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read047
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read048
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read049
+			inserted sequence: GGTAACGTGTA
 INS: chr1	367	Forward	chr1	368	Forward	11	0	read050
+			inserted sequence: GGTAACGTGTA
 DEL: chr1	383	Forward	chr1	395	Forward	0	0	read050
 BND: chr1	10	Reverse	chr1	470	Reverse	0	0	read051
 BND: chr1	10	Reverse	chr1	470	Reverse	0	0	read052
@@ -77,5 +103,5 @@ DUP:TANDEM: chr1	511	Forward	chr1	528	Forward	0	2	read053
 2 segments describe this tandem duplication. Its length on the read is 39 and a single duplicated part has a length of 18 => tandem_dup_count = 2
 INS: chr1	508	Reverse	chr1	528	Reverse	6	0	read054
 Start clustering...
-Done with clustering. Found 18 junction clusters.
+Done with clustering. Found 24 junction clusters.
 No refinement was selected.
diff --git a/test/data/mini_example/output_short_and_long_res.txt b/test/data/mini_example/output_short_and_long_res.txt
index 315d7459..0878792d 100644
--- a/test/data/mini_example/output_short_and_long_res.txt
+++ b/test/data/mini_example/output_short_and_long_res.txt
@@ -12,12 +12,18 @@
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	MYSAMPLE
 chr1	57	.	N	<DEL>	18	PASS	END=70;SVLEN=-13;SVTYPE=DEL	GT	./.
 chr1	97	.	N	<DEL>	1	PASS	END=125;SVLEN=-28;SVTYPE=DEL	GT	./.
-chr1	110	.	N	<DUP:TANDEM>	1	PASS	END=125;SVLEN=14;SVTYPE=DUP	GT	./.
+chr1	110	.	N	<DUP:TANDEM>	1	PASS	END=125;SVLEN=15;SVTYPE=DUP	GT	./.
 chr1	125	.	N	<INS>	14	PASS	END=125;SVLEN=15;SVTYPE=INS	GT	./.
-chr1	180	.	N	<INS>	7	PASS	END=180;SVLEN=8;SVTYPE=INS	GT	./.
+chr1	180	.	N	<DUP:TANDEM>	1	PASS	END=188;SVLEN=8;SVTYPE=DUP	GT	./.
+chr1	180	.	N	<DUP:TANDEM>	1	PASS	END=188;SVLEN=8;SVTYPE=DUP	GT	./.
+chr1	180	.	N	<DUP:TANDEM>	1	PASS	END=188;SVLEN=8;SVTYPE=DUP	GT	./.
+chr1	180	.	N	<DUP:TANDEM>	1	PASS	END=188;SVLEN=8;SVTYPE=DUP	GT	./.
+chr1	180	.	N	<DUP:TANDEM>	1	PASS	END=188;SVLEN=8;SVTYPE=DUP	GT	./.
+chr1	180	.	N	<DUP:TANDEM>	1	PASS	END=188;SVLEN=8;SVTYPE=DUP	GT	./.
+chr1	180	.	N	<DUP:TANDEM>	1	PASS	END=188;SVLEN=8;SVTYPE=DUP	GT	./.
 chr1	266	.	N	<DEL>	4	PASS	END=286;SVLEN=-20;SVTYPE=DEL	GT	./.
 chr1	282	.	N	<DEL>	1	PASS	END=299;SVLEN=-17;SVTYPE=DEL	GT	./.
 chr1	336	.	N	<DEL>	4	PASS	END=350;SVLEN=-14;SVTYPE=DEL	GT	./.
 chr1	368	.	N	<INS>	5	PASS	END=368;SVLEN=11;SVTYPE=INS	GT	./.
 chr1	384	.	N	<DEL>	1	PASS	END=395;SVLEN=-11;SVTYPE=DEL	GT	./.
-chr1	512	.	N	<DUP:TANDEM>	1	PASS	END=529;SVLEN=16;SVTYPE=DUP	GT	./.
+chr1	512	.	N	<DUP:TANDEM>	1	PASS	END=529;SVLEN=17;SVTYPE=DUP	GT	./.

From 4a44217453c3e8e2c753258b9e6d5640fffa0999 Mon Sep 17 00:00:00 2001
From: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
Date: Tue, 10 Aug 2021 16:59:27 +0200
Subject: [PATCH 4/4] [TEST] Add some test cases

Signed-off-by: Lydia Buntrock <lydia.buntrock@fu-berlin.de>
---
 test/api/detection_test.cpp  | 149 +++++++++++++++++++++++++++++++++++
 test/api/structures_test.cpp |  43 +++++++++-
 2 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/test/api/detection_test.cpp b/test/api/detection_test.cpp
index 459c76e6..4a1b612f 100644
--- a/test/api/detection_test.cpp
+++ b/test/api/detection_test.cpp
@@ -90,6 +90,155 @@ TEST(junction_detection, cigar_string_del_padding)
     }
 }
 
+TEST(junction_detection, cigar_string_dup_detect_tandem_duplication)
+{
+    seqan3::dna5_vector const & query_sequence = {"AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC"_dna5};
+    int32_t length = 15;
+    int32_t pos_start_dup_seq = 0;
+    int32_t pos_end_dup_seq = 0;
+    size_t tandem_dup_count = 0;
+
+    // Prefix Case: The duplication (insertion) comes after the matched sequence.
+    //                            (10,                   20]          -> pos_start_dup_seq, pos_end_dup_seq
+    //              ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG
+    //                    |||||||||||||||||||               ||||||||
+    //              read  AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC  -> inserted sequence: GCGGGGCGGGGCGGG
+    //
+    //              suffix_sequence AAAACCGCGTAGCGGGGCGGG
+    //                                         ||||||||||
+    //              inserted sequence:         GCGGGGCGGGGCGGG
+    //              -> tandem_dup_count = 5, inserted_bases = GCGGG
+    int32_t pos_ref = 21;
+    int32_t pos_read = 19;
+    std::span<const seqan3::dna5> inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); // "GCGGGGCGGG"_dna5
+
+    auto duplicated_bases = detect_tandem_duplication(query_sequence,
+                                                      length,
+                                                      pos_ref,
+                                                      pos_read,
+                                                      inserted_bases,
+                                                      4,    // minimal length of duplication
+                                                      pos_start_dup_seq,
+                                                      pos_end_dup_seq,
+                                                      tandem_dup_count);
+    EXPECT_EQ(tandem_dup_count, 5);
+    EXPECT_EQ(pos_start_dup_seq, 10);
+    EXPECT_EQ(pos_end_dup_seq, 20);
+
+    // Suffix Case: The duplication (insertion) comes before the matched sequence.
+    //                            (10,                   20]          -> pos_start_dup_seq, pos_end_dup_seq
+    //              ref AAAACCGCGTA---------------GCGGGGCGGGTACGTAACGGTACG
+    //                    |||||||||               ||||||||||||||||||
+    //              read  AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC  -> inserted sequence: GCGGGGCGGGGCGGG
+    //
+    //              prefix_sequence         GCGGGGCGGGTACGTAACGGTACG
+    //                                      ||||||||||
+    //              inserted sequence: GCGGGGCGGGGCGGG
+    //              -> tandem_dup_count = 5, inserted_bases = GCGGG
+    pos_ref = 11;   // position of first inserted base.
+    pos_read = 9;
+    inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length);
+    duplicated_bases = detect_tandem_duplication(query_sequence,
+                                                 length,
+                                                 pos_ref,
+                                                 pos_read,
+                                                 inserted_bases,
+                                                 4,     // minimal length of duplication
+                                                 pos_start_dup_seq,
+                                                 pos_end_dup_seq,
+                                                 tandem_dup_count);
+    EXPECT_EQ(tandem_dup_count, 5);
+    EXPECT_EQ(pos_start_dup_seq, 10);
+    EXPECT_EQ(pos_end_dup_seq, 20);
+
+    // Special Case: The duplication (insertion) is in between of matching sequences.
+    //                            (10,                   25]          -> pos_start_dup_seq, pos_end_dup_seq
+    //              ref AAAACCGCGTAGCGGG----------GCGGGGCGGGTACGTAACGGTACG
+    //                    ||||||||||||||          |||||||||||||
+    //              read  AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC  -> inserted sequence: GCGGGGCGGG
+    //
+    //              suffix_sequence AAAACCGCGTAGCGGG  prefix_sequence GCGGGGCGGGTACGTAACGGTACG
+    //                                         |||||                  ||||||||||
+    //              inserted sequence:         GCGGGGCGGG             GCGGGGCGGG
+    //              -> tandem_dup_count = 5, inserted_bases = GCGGG
+    length = 10;
+    pos_ref = 16;
+    pos_read = 14;
+    inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); // "GCGGG"_dna5
+
+    duplicated_bases = detect_tandem_duplication(query_sequence,
+                                                 length,
+                                                 pos_ref,
+                                                 pos_read,
+                                                 inserted_bases,
+                                                 4,     // minimal length of duplication
+                                                 pos_start_dup_seq,
+                                                 pos_end_dup_seq,
+                                                 tandem_dup_count);
+    EXPECT_EQ(tandem_dup_count, 5);
+    EXPECT_EQ(pos_start_dup_seq, 10);
+    EXPECT_EQ(pos_end_dup_seq, 25);
+
+    // Case with errors at the end: The duplication (insertion) includes some errors at the end.
+    //                            (10,                   20]          -> pos_start_dup_seq, pos_end_dup_seq
+    //              ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG
+    //                    |||||||||||||||||||               ||||||||
+    //              read  AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCAAATACGTAAC  -> inserted sequence: GCGGGGCGGGGCAAA
+    //
+    //              suffix_sequence AAAACCGCGTAGCGGGGCGGG
+    //                                         ||||||||||
+    //              inserted sequence:         GCGGGGCGGGGCAAA
+    //
+    //              suffix_sequence AAAACCGCGTAGCGGGGCGGG
+    //                                              ||---             -> subsequence does not match without mismatches
+    //              inserted sequence:              GCAAA
+    //
+    seqan3::dna5_vector const & query_sequence_2 = {"AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCAAATACGTAAC"_dna5};
+    length = 15;
+    pos_ref = 21;
+    pos_read = 19;
+    inserted_bases = query_sequence_2 | seqan3::views::slice(pos_read, pos_read + length); // "GCGGGGCGGGGCAAA"_dna5
+
+    duplicated_bases = detect_tandem_duplication(query_sequence_2,
+                                                 length,
+                                                 pos_ref,
+                                                 pos_read,
+                                                 inserted_bases,
+                                                 4,    // minimal length of duplication
+                                                 pos_start_dup_seq,
+                                                 pos_end_dup_seq,
+                                                 tandem_dup_count);
+    EXPECT_EQ(tandem_dup_count, 0);
+    EXPECT_EQ(pos_start_dup_seq, 20);
+    EXPECT_EQ(pos_end_dup_seq, 20);
+
+    // Case with a single error: The duplication (insertion) includes a single errors.
+    //                            (10,                   20]          -> pos_start_dup_seq, pos_end_dup_seq
+    //              ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG
+    //                    |||||||||||||||||||               ||||||||
+    //              read  AACCGCGTAGCGGGGCGGGGCGAGGCGGGGCGGGTACGTAAC  -> inserted sequence: GCGAGGCGGGGCGGG
+    //
+    //              suffix_sequence AAAACCGCGTAGCGGGGCGGG
+    //                                         |||-||||||             -> single mismatch in the alignment
+    //              inserted sequence:         GCGAGGCGGGGCGGG
+    //
+    seqan3::dna5_vector const & query_sequence_3 = {"AACCGCGTAGCGGGGCGGGGCGAGGCGGGGCGGGTACGTAAC"_dna5};
+    inserted_bases = query_sequence_3 | seqan3::views::slice(pos_read, pos_read + length); // "GCGAGGCGGGGCGGG"_dna5
+
+    duplicated_bases = detect_tandem_duplication(query_sequence_3,
+                                                 length,
+                                                 pos_ref,
+                                                 pos_read,
+                                                 inserted_bases,
+                                                 4,    // minimal length of duplication
+                                                 pos_start_dup_seq,
+                                                 pos_end_dup_seq,
+                                                 tandem_dup_count);
+    EXPECT_EQ(tandem_dup_count, 0);
+    EXPECT_EQ(pos_start_dup_seq, 20);
+    EXPECT_EQ(pos_end_dup_seq, 20);
+}
+
 TEST(junction_detection, cigar_string_simple_ins)
 {
     std::string const read_name = "read021";
diff --git a/test/api/structures_test.cpp b/test/api/structures_test.cpp
index 243e6b85..3ce22982 100644
--- a/test/api/structures_test.cpp
+++ b/test/api/structures_test.cpp
@@ -2,11 +2,14 @@
 
 #include "structures/aligned_segment.hpp"
 #include "structures/breakend.hpp"
+#include "structures/cluster.hpp"
+#include "structures/junction.hpp"
 #include "variant_detection/method_enums.hpp"
 
 /* tests for aligned_segment */
 
 using seqan3::operator""_cigar_operation;
+using seqan3::operator""_dna5;
 
 TEST(structures, aligned_segment)
 {
@@ -104,7 +107,7 @@ TEST(structures, method_enums_refinement_methods)
 
 /* tests for junctions */
 
-TEST(structures, breakend_flip_orientation)
+TEST(structures, junctions_breakend_flip_orientation)
 {
     Breakend forward_breakend{"chr1", 42, strand::forward};
     Breakend reverse_breakend{"chr1", 42, strand::reverse};
@@ -117,3 +120,41 @@ TEST(structures, breakend_flip_orientation)
     forward_breakend.flip_orientation();
     EXPECT_EQ(forward_breakend, reverse_breakend); // both are forward now
 }
+
+/* tests for clusters */
+
+TEST(structures, clusters_get_common_tandem_dup_count)
+{
+    Junction junction_1{Breakend{"chr1", 123456, strand::forward},
+                        Breakend{"chr1", 234567, strand::forward},
+                        ""_dna5, 0, "read_1"};
+    Junction junction_2{Breakend{"chr1", 123456, strand::forward},
+                        Breakend{"chr1", 234567, strand::forward},
+                        ""_dna5, 1, "read_1"};
+    Junction junction_3{Breakend{"chr1", 123456, strand::forward},
+                        Breakend{"chr1", 234567, strand::forward},
+                        ""_dna5, 2, "read_1"};
+    Junction junction_4{Breakend{"chr1", 123456, strand::forward},
+                        Breakend{"chr1", 234567, strand::forward},
+                        ""_dna5, 7, "read_1"};
+
+    // Tandem duplication with 2 copies:
+    Cluster cluster_1{{junction_3, junction_3, junction_3}};
+    EXPECT_EQ(cluster_1.get_common_tandem_dup_count(), 2);
+
+    // Tandem duplication with different amount of copies:
+    Cluster cluster_2{{junction_2, junction_3, junction_4}};
+    EXPECT_EQ(cluster_2.get_common_tandem_dup_count(), 3); // (1+2+7)/3=3,333
+
+    // More than two thirds of the junctions have a 0 tandem_dup_count, than its probably no tandem duplication.
+    Cluster cluster_3{{junction_1, junction_1, junction_1, junction_2}};
+    EXPECT_EQ(cluster_3.get_common_tandem_dup_count(), 0);
+
+    // Two thirds of the junctions have a 0 tandem_dup_count, than its probably a tandem duplication.
+    Cluster cluster_4{{junction_1, junction_1, junction_2}};
+    EXPECT_EQ(cluster_4.get_common_tandem_dup_count(), 0);
+
+    // Less than two thirds of the junctions have a 0 tandem_dup_count, than its probably a tandem duplication.
+    Cluster cluster_5{{junction_1, junction_2}};
+    EXPECT_EQ(cluster_5.get_common_tandem_dup_count(), 1);
+}