Skip to content

Commit

Permalink
fix: clusters expanded off contig edge
Browse files Browse the repository at this point in the history
 - fixes #28, where a plasmid variant occurred at position 17
 - generate_str() tried to expand a cluster off the edge of the contig
  • Loading branch information
TimD1 committed Jun 10, 2024
1 parent 92db7b5 commit 7ba7ed4
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 10 deletions.
18 changes: 13 additions & 5 deletions src/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -674,9 +674,10 @@ void simple_cluster(std::shared_ptr<variantData> vcf, int callset) {
/* Add single-VCF cluster indices to `variantData`. This version assumes that
* all variant calls are true positives (doesn't allow skipping)
*/
void wf_swg_cluster(variantData * vcf, std::string ctg,
void wf_swg_cluster(variantData * vcf, int ctg_idx,
int hap, int sub, int open, int extend) {
bool print = false;
std::string ctg = vcf->contigs[ctg_idx];

// allocate this memory once, use on each cluster
std::vector<int> offs_buffer(MATS * g.max_size*2 *
Expand Down Expand Up @@ -753,12 +754,13 @@ void wf_swg_cluster(variantData * vcf, std::string ctg,
}
}

// calculate alignment score
int l_reach = 0, r_reach = 0, score = 0;
if (left_compute || right_compute) {
int beg_idx = vars->clusters[clust];
int end_idx = vars->clusters[clust+1];
int beg = vars->poss[beg_idx] - 1;
int end = vars->poss[end_idx-1] + vars->rlens[end_idx-1] + 1;
int beg = std::max(0, vars->poss[beg_idx] - 1);
int end = std::min(vcf->lengths[ctg_idx], vars->poss[end_idx-1] + vars->rlens[end_idx-1] + 1);
std::string query = generate_str(vcf->ref, vars, ctg, beg_idx, end_idx, beg, end);
std::string ref = vcf->ref->fasta.at(ctg).substr(beg, end-beg);
std::vector< std::vector< std::vector<uint8_t> > > ptrs(MATS);
Expand Down Expand Up @@ -789,7 +791,7 @@ void wf_swg_cluster(variantData * vcf, std::string ctg,
int reach = ref_len - 1;
while (reach == ref_len-1) { // iterative doubling
ref_len *= 2;
beg_pos = end_pos - ref_len - std::abs(main_diag) - score/extend - 3; // ensure query is longer
beg_pos = std::max(0, end_pos - ref_len - std::abs(main_diag) - score/extend - 3); // ensure query is longer
query = generate_str(vcf->ref, vars, ctg,
vars->clusters[clust],
vars->clusters[clust+1],
Expand All @@ -810,6 +812,8 @@ void wf_swg_cluster(variantData * vcf, std::string ctg,
// reset buffer
for (size_t i = 0; i < offs_size; i++)
offs_buffer[i] = -2;
// exit if we've reached the start of the contig
if (beg_pos == 0) break;
}
if (print) printf(" left reach: %d\n", reach);
l_reach = end_pos - reach;
Expand Down Expand Up @@ -849,7 +853,8 @@ void wf_swg_cluster(variantData * vcf, std::string ctg,
int reach = ref_len - 1;
while (reach == ref_len-1) { // iterative doubling
ref_len *= 2;
end_pos = beg_pos + ref_len + std::abs(main_diag) + score/extend + 3;
end_pos = std::min(vcf->lengths[ctg_idx],
beg_pos + ref_len + std::abs(main_diag) + score/extend + 3);
query = generate_str(vcf->ref, vars, ctg,
vars->clusters[clust],
vars->clusters[clust+1],
Expand All @@ -868,6 +873,9 @@ void wf_swg_cluster(variantData * vcf, std::string ctg,
// reset buffer
for (size_t i = 0; i < offs_size; i++)
offs_buffer[i] = -2;
// exit if we've reached the end of the contig
if (end_pos == vcf->lengths[ctg_idx]) break;

}
if (print) printf(" right reach: %d\n", reach);
r_reach = beg_pos + reach + 1;
Expand Down
2 changes: 1 addition & 1 deletion src/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class superclusterData {

// for single haplotype clustering (one VCF)
void simple_cluster(std::shared_ptr<variantData> vcf, int callset);
void wf_swg_cluster(variantData * vcf, std::string ctg, int hap,
void wf_swg_cluster(variantData * vcf, int ctg_idx, int hap,
int sub, int open, int extend);
std::vector< std::vector< std::vector<int> > >
sort_superclusters(std::shared_ptr<superclusterData>);
Expand Down
8 changes: 4 additions & 4 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ int main(int argc, char **argv) {
std::vector<std::thread> threads;
for (int t = 0; t < HAPS*int(query_ptr->contigs.size()); t++) {
threads.push_back(std::thread( wf_swg_cluster,
query_ptr.get(), query_ptr->contigs[t/2], t%2, /* hap */
query_ptr.get(), t/2 /* contig */, t%2, /* hap */
g.sub, g.open, g.extend));
if ((t+1) % g.max_threads == 0) { // wait for thread batch to complete
for (std::thread & thread : threads) thread.join();
Expand Down Expand Up @@ -117,7 +117,7 @@ int main(int argc, char **argv) {
std::vector<std::thread> threads;
for (int t = 0; t < HAPS*int(query_ptr->contigs.size()); t++) {
threads.push_back(std::thread( wf_swg_cluster,
query_ptr.get(), query_ptr->contigs[t/2], t%2, /* hap */
query_ptr.get(), t/2 /* contig */, t%2, /* hap */
g.sub, g.open, g.extend));
if ((t+1) % g.max_threads == 0) { // wait for thread batch to complete
for (std::thread & thread : threads) thread.join();
Expand All @@ -144,7 +144,7 @@ int main(int argc, char **argv) {
std::vector<std::thread> threads;
for (int t = 0; t < HAPS*int(truth_ptr->contigs.size()); t++) {
threads.push_back(std::thread( wf_swg_cluster,
truth_ptr.get(), truth_ptr->contigs[t/2], t%2, /* hap */
truth_ptr.get(), t/2 /* contig */, t%2, /* hap */
g.sub, g.open, g.extend));
if ((t+1) % g.max_threads == 0) { // wait for thread batch to complete
for (std::thread & thread : threads) thread.join();
Expand Down Expand Up @@ -191,7 +191,7 @@ int main(int argc, char **argv) {
std::vector<std::thread> threads;
for (int t = 0; t < HAPS*int(truth_ptr->contigs.size()); t++) {
threads.push_back(std::thread( wf_swg_cluster,
truth_ptr.get(), truth_ptr->contigs[t/2], t%2, /* hap */
truth_ptr.get(), t/2 /* contig */, t%2, /* hap */
g.sub, g.open, g.extend));
if ((t+1) % g.max_threads == 0) { // wait for thread batch to complete
for (std::thread & thread : threads) thread.join();
Expand Down

0 comments on commit 7ba7ed4

Please sign in to comment.